(1) Conduct an exploratory data analysis (EDA) of the TRAIN_SET.CSV. Provide an overview of the data and any underlying patterns you may identify. Without a thorough data dictionary, you may have to make some assumptions about the data. Document any transformations you perform. (10 points)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
#modeling
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
warnings.filterwarnings('ignore')
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import StratifiedShuffleSplit
RS = 15
from collections import Counter
from sklearn.metrics import accuracy_score, log_loss
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix
data = pd.read_csv('TRAIN_SET.csv') #import data
#data.head(5)
#data.info()
*A good place to start would be col 27 NEXT_INSPECTION_GRADE_C_OR_BELOW since it is the variable we are trying to predict in the following question.
data['NEXT_INSPECTION_GRADE_C_OR_BELOW'].describe(include="0")
count 15633 unique 8 top 0 freq 13143 Name: NEXT_INSPECTION_GRADE_C_OR_BELOW, dtype: object
#sns.distplot(data['NEXT_INSPECTION_GRADE_C_OR_BELOW']); causes an error since there is a string 'goat'
I need to look into the unique values and in general each col starting with col 27
data['NEXT_INSPECTION_GRADE_C_OR_BELOW'].unique()
array(['0', '1', nan, '-3', 'Goat', '7', '9', '4', '3'], dtype=object)
data = data.dropna()
data['INSPECTION_TIME'].head(5)
0 11/5/2013 16:15 1 9/21/2012 13:00 2 9/16/2011 11:50 3 11/22/2011 11:40 5 9/1/2015 17:15 Name: INSPECTION_TIME, dtype: object
data = data.sort_values(by=['INSPECTION_TIME'])
data
| RESTAURANT_SERIAL_NUMBER | RESTAURANT_PERMIT_NUMBER | RESTAURANT_NAME | RESTAURANT_LOCATION | RESTAURANT_CATEGORY | ADDRESS | CITY | STATE | ZIP | CURRENT_DEMERITS | ... | RECORD_UPDATED | LAT_LONG_RAW | FIRST_VIOLATION | SECOND_VIOLATION | THIRD_VIOLATION | FIRST_VIOLATION_TYPE | SECOND_VIOLATION_TYPE | THIRD_VIOLATION_TYPE | NUMBER_OF_VIOLATIONS | NEXT_INSPECTION_GRADE_C_OR_BELOW | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4535 | DA1098976 | PR0023727 | HOOKAH PALACE BAR | HOOKAH PALACE | Bar / Tavern | 1030 E Flamingo Rd | Las Vegas | Nevada | 89119-7436 | 0.0 | ... | 8/29/2013 10:41 | (36.1155934, 115.1414363) | 212.0 | 215.0 | 216.0 | Major | Major | Major | 7 | 0 |
| 7989 | DA0916254 | PR0010827 | MCDONALDS #23038 | MCDONALDS #23038 | Restaurant | 5811 E Charleston Blvd | Las Vegas | Nevada | 89142 | 3.0 | ... | 2/21/2013 22:26 | (36.1589722, 115.0469673) | 202.0 | 211.0 | 215.0 | Critical | Major | Major | 10 | 0 |
| 11689 | DA0915710 | PR0021625 | Bears Best Las Vegas Restaurant | Bears Best Las Vegas Lp | Restaurant | 11111 W Flamingo Rd | Las Vegas | Nevada | 89135-2613 | 8.0 | ... | 2/21/2013 22:26 | (36.120164, 115.335163) | 209.0 | 214.0 | 215.0 | Critical | Major | Major | 8 | 0 |
| 506 | DA0916074 | PR0021103 | MARISCOS EL MARIACHI | MARISCOS EL MARIACHI | Restaurant | 552 N Eastern Ave A | Las Vegas | Nevada | 89101-3488 | 0.0 | ... | 2/21/2013 22:26 | (36.1723554, 115.1160331) | 202.0 | 209.0 | 214.0 | Critical | Critical | Major | 7 | 0 |
| 9739 | DA0915660 | PR0016470 | New China Buffet | New China Buffet | Buffet | 1510 E Flamingo Rd | Las Vegas | Nevada | 89119-5253 | 51.0 | ... | 2/21/2013 22:26 | (36.115612, 115.13276) | 202.0 | 206.0 | 209.0 | Critical | Critical | Critical | 20 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 194 | DA0ODTZQF | PR0018495 | Teriyaki Madness | Teriyaki Madness | Restaurant | 725 W Craig Rd | North Las Vegas | Nevada | 89032-6199 | 9.0 | ... | 9/12/2016 9:59 | (36.239207, 115.153386) | 212.0 | 213.0 | 215.0 | Major | Major | Major | 3 | 0 |
| 15302 | DAK7RVZ6O | PR0114198 | SMASHED PIG - RESTAURANT | SMASHED PIG GASTROPUB, THE | Restaurant | 509 FREMONT | Las Vegas | Nevada | 89101 | 8.0 | ... | 9/9/2016 16:05 | (36.1692635, 115.1406902) | 202.0 | 208.0 | 211.0 | Critical | Critical | Major | 10 | 0 |
| 4709 | DAEOJHXFM | PR0011660 | SOUTH STRIP TRAVELODGE CONT BREAKFAST | SOUTH STRIP TRAVELODGE | Snack Bar | 3735 S Las Vegas Blvd | Las Vegas | Nevada | 89109-4308 | 5.0 | ... | 9/9/2016 9:57 | (36.107315, 115.170976) | 212.0 | 213.0 | 215.0 | Major | Major | Major | 8 | 0 |
| 4180 | DAX8DYZ7Y | PR0021500 | Afandi Restaurant | Afandi Restaurant | Restaurant | 5181 W Charleston Blvd | Las Vegas | Nevada | 89146-1431 | 8.0 | ... | 9/9/2016 16:26 | (36.1587543, 115.2095993) | 208.0 | 209.0 | 216.0 | Critical | Critical | Major | 8 | 0 |
| 6802 | DAYQX02H2 | PR0010829 | MCDONALDS #17690 | MCDONALDS #17690 | Restaurant | 6020 W Tropicana Ave | Las Vegas | Nevada | 89103-4800 | 6.0 | ... | 9/9/2016 10:43 | (36.100462, 115.224075) | 202.0 | 209.0 | 211.0 | Critical | Critical | Major | 6 | 0 |
12083 rows × 28 columns
# from ast import literal_eval
# import folium
# VEGAS_COORDINATES = (36.17, -115.14)
# map = folium.Map(location=VEGAS_COORDINATES, zoom_start=12)
# # add a marker for every record in the filtered data, use a clustered view
# for each in data[0:MAX_RECORDS].iterrows():
# folium.Marker(literal_eval(each[1]["LAT_LONG_RAW"]),
# popup=each[1]["RESTAURANT_NAME"] + " # of Violation: "
# + each[1]["NUMBER_OF_VIOLATIONS"]).add_to(map)
# display(map)
data['CITY'].unique()
array(['Las Vegas', 'North Las Vegas', 'Henderson', 'Boulder City',
'Mesquite', 'Logandale', 'Primm', 'Laughlin', 'Saskatoon',
'Searchlight', 'Overton', 'Moapa', 'New York',
'Truth or Consequences', 'Walla Walla', 'Indian Springs',
'Jellystone Park', 'Jean', 'Blue Diamond', 'Deep Space Nine',
'Sandy Valley', 'Goodsprings', 'Port of Spain', 'HendeSON'],
dtype=object)
data['STATE'].unique()
array(['Nevada', 'SK', 'New York', 'New Mexico', 'Washington', 'Montana',
'Nevada?', 'Star Trek', 'NeVaDa', 'TT'], dtype=object)
data[data['STATE']=='Nevada?'] #ok is in vegas
| RESTAURANT_SERIAL_NUMBER | RESTAURANT_PERMIT_NUMBER | RESTAURANT_NAME | RESTAURANT_LOCATION | RESTAURANT_CATEGORY | ADDRESS | CITY | STATE | ZIP | CURRENT_DEMERITS | ... | RECORD_UPDATED | LAT_LONG_RAW | FIRST_VIOLATION | SECOND_VIOLATION | THIRD_VIOLATION | FIRST_VIOLATION_TYPE | SECOND_VIOLATION_TYPE | THIRD_VIOLATION_TYPE | NUMBER_OF_VIOLATIONS | NEXT_INSPECTION_GRADE_C_OR_BELOW | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 13253 | DA0937174 | PR0021552 | Quiznos Sub | Quiznos Sub | Restaurant | 4985 W Tropicana Ave | Las Vegas | Nevada? | 89103-7078 | 0.0 | ... | 2/21/2013 22:26 | (36.100688, 115.20919) | 209.0 | 212.0 | 215.0 | Critical | Major | Major | 5 | 1 |
1 rows × 28 columns
data[data['STATE']=='NeVaDa']
| RESTAURANT_SERIAL_NUMBER | RESTAURANT_PERMIT_NUMBER | RESTAURANT_NAME | RESTAURANT_LOCATION | RESTAURANT_CATEGORY | ADDRESS | CITY | STATE | ZIP | CURRENT_DEMERITS | ... | RECORD_UPDATED | LAT_LONG_RAW | FIRST_VIOLATION | SECOND_VIOLATION | THIRD_VIOLATION | FIRST_VIOLATION_TYPE | SECOND_VIOLATION_TYPE | THIRD_VIOLATION_TYPE | NUMBER_OF_VIOLATIONS | NEXT_INSPECTION_GRADE_C_OR_BELOW | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10034 | DA0993030 | PR0015443 | Qdoba Mexican Grill | QDOBA #531 | Restaurant | 6965 S RAINBOW 125 | Las Vegas | NeVaDa | 89113 | 0.0 | ... | 2/21/2013 22:26 | (36.0621528, 115.243379) | 214.0 | 230.0 | 233.0 | Major | Non-Major | Non-Major | 3 | 0 |
1 rows × 28 columns
#for fun check TT
data[data['STATE']=='TT']
| RESTAURANT_SERIAL_NUMBER | RESTAURANT_PERMIT_NUMBER | RESTAURANT_NAME | RESTAURANT_LOCATION | RESTAURANT_CATEGORY | ADDRESS | CITY | STATE | ZIP | CURRENT_DEMERITS | ... | RECORD_UPDATED | LAT_LONG_RAW | FIRST_VIOLATION | SECOND_VIOLATION | THIRD_VIOLATION | FIRST_VIOLATION_TYPE | SECOND_VIOLATION_TYPE | THIRD_VIOLATION_TYPE | NUMBER_OF_VIOLATIONS | NEXT_INSPECTION_GRADE_C_OR_BELOW | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8299 | DA1086357 | PR0099062 | 1900 ASIAN CUISINE | 1900 ASIAN CUISINE | Restaurant | 5115 Spring Mountain Rd 103 | Port of Spain | TT | 99999 | 0.0 | ... | 7/12/2013 9:23 | (36.1255374, 115.2112409) | 202.0 | 209.0 | 211.0 | Critical | Critical | Major | 9 | 0 |
1 rows × 28 columns
I am really just interested in las vegas restaurants since this is alas vegas related dataset
indexA = data[ data['STATE'] == 'Nevada'].index
print(len(indexA))
indexB = data[ data['STATE'] == 'Nevada?'].index
print(len(indexB))
indexC = data[ data['STATE'] == 'NeVaDa'].index
print(len(indexC))
#if i combine these lists of index values i will have a good start in terms of cleaning data
12063 1 1
indexNEVADA = list(indexC)+list(indexB)+list(indexA)
list(indexC)
[10034]
df=data.loc[indexNEVADA]
df
| RESTAURANT_SERIAL_NUMBER | RESTAURANT_PERMIT_NUMBER | RESTAURANT_NAME | RESTAURANT_LOCATION | RESTAURANT_CATEGORY | ADDRESS | CITY | STATE | ZIP | CURRENT_DEMERITS | ... | RECORD_UPDATED | LAT_LONG_RAW | FIRST_VIOLATION | SECOND_VIOLATION | THIRD_VIOLATION | FIRST_VIOLATION_TYPE | SECOND_VIOLATION_TYPE | THIRD_VIOLATION_TYPE | NUMBER_OF_VIOLATIONS | NEXT_INSPECTION_GRADE_C_OR_BELOW | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10034 | DA0993030 | PR0015443 | Qdoba Mexican Grill | QDOBA #531 | Restaurant | 6965 S RAINBOW 125 | Las Vegas | NeVaDa | 89113 | 0.0 | ... | 2/21/2013 22:26 | (36.0621528, 115.243379) | 214.0 | 230.0 | 233.0 | Major | Non-Major | Non-Major | 3 | 0 |
| 13253 | DA0937174 | PR0021552 | Quiznos Sub | Quiznos Sub | Restaurant | 4985 W Tropicana Ave | Las Vegas | Nevada? | 89103-7078 | 0.0 | ... | 2/21/2013 22:26 | (36.100688, 115.20919) | 209.0 | 212.0 | 215.0 | Critical | Major | Major | 5 | 1 |
| 4535 | DA1098976 | PR0023727 | HOOKAH PALACE BAR | HOOKAH PALACE | Bar / Tavern | 1030 E Flamingo Rd | Las Vegas | Nevada | 89119-7436 | 0.0 | ... | 8/29/2013 10:41 | (36.1155934, 115.1414363) | 212.0 | 215.0 | 216.0 | Major | Major | Major | 7 | 0 |
| 7989 | DA0916254 | PR0010827 | MCDONALDS #23038 | MCDONALDS #23038 | Restaurant | 5811 E Charleston Blvd | Las Vegas | Nevada | 89142 | 3.0 | ... | 2/21/2013 22:26 | (36.1589722, 115.0469673) | 202.0 | 211.0 | 215.0 | Critical | Major | Major | 10 | 0 |
| 11689 | DA0915710 | PR0021625 | Bears Best Las Vegas Restaurant | Bears Best Las Vegas Lp | Restaurant | 11111 W Flamingo Rd | Las Vegas | Nevada | 89135-2613 | 8.0 | ... | 2/21/2013 22:26 | (36.120164, 115.335163) | 209.0 | 214.0 | 215.0 | Critical | Major | Major | 8 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 194 | DA0ODTZQF | PR0018495 | Teriyaki Madness | Teriyaki Madness | Restaurant | 725 W Craig Rd | North Las Vegas | Nevada | 89032-6199 | 9.0 | ... | 9/12/2016 9:59 | (36.239207, 115.153386) | 212.0 | 213.0 | 215.0 | Major | Major | Major | 3 | 0 |
| 15302 | DAK7RVZ6O | PR0114198 | SMASHED PIG - RESTAURANT | SMASHED PIG GASTROPUB, THE | Restaurant | 509 FREMONT | Las Vegas | Nevada | 89101 | 8.0 | ... | 9/9/2016 16:05 | (36.1692635, 115.1406902) | 202.0 | 208.0 | 211.0 | Critical | Critical | Major | 10 | 0 |
| 4709 | DAEOJHXFM | PR0011660 | SOUTH STRIP TRAVELODGE CONT BREAKFAST | SOUTH STRIP TRAVELODGE | Snack Bar | 3735 S Las Vegas Blvd | Las Vegas | Nevada | 89109-4308 | 5.0 | ... | 9/9/2016 9:57 | (36.107315, 115.170976) | 212.0 | 213.0 | 215.0 | Major | Major | Major | 8 | 0 |
| 4180 | DAX8DYZ7Y | PR0021500 | Afandi Restaurant | Afandi Restaurant | Restaurant | 5181 W Charleston Blvd | Las Vegas | Nevada | 89146-1431 | 8.0 | ... | 9/9/2016 16:26 | (36.1587543, 115.2095993) | 208.0 | 209.0 | 216.0 | Critical | Critical | Major | 8 | 0 |
| 6802 | DAYQX02H2 | PR0010829 | MCDONALDS #17690 | MCDONALDS #17690 | Restaurant | 6020 W Tropicana Ave | Las Vegas | Nevada | 89103-4800 | 6.0 | ... | 9/9/2016 10:43 | (36.100462, 115.224075) | 202.0 | 209.0 | 211.0 | Critical | Critical | Major | 6 | 0 |
12065 rows × 28 columns
#A test to see if the state wise filtering helped with removing outliers for cities
df['CITY'].unique()
array(['Las Vegas', 'North Las Vegas', 'Henderson', 'Boulder City',
'Mesquite', 'Logandale', 'Primm', 'Laughlin', 'Searchlight',
'Overton', 'Moapa', 'Indian Springs', 'Jean', 'Blue Diamond',
'Sandy Valley', 'Goodsprings', 'HendeSON'], dtype=object)
# I AM NOT FAMILIAR WITH LAS VEGAS SOO LET ME QUICKLY GOOGLE SEARCH IF THESE CITIES EXIST IN THE STATE OF NEVADA
#HENDERSON, CHECK
#Bouldercity, CHECK
#Mesquite, CHECK
#Logandale, CHECK
#Primm, CHECK
#Laughlin, CHECK
#Searchlight, CHECK
#Overton, CHECK
#Moapa, CHECK
#Springs, CHECK
#Jean, CHECK
#BLUE DIAMOND, CHECK
#SANDY VALLEY, CHECK
#Goodsprings, CHECK
#HendeSON, NOT CHECK **But probably meant to say 'Henderson'
df[df['CITY']=='HendeSON'] #need to fix this City name
| RESTAURANT_SERIAL_NUMBER | RESTAURANT_PERMIT_NUMBER | RESTAURANT_NAME | RESTAURANT_LOCATION | RESTAURANT_CATEGORY | ADDRESS | CITY | STATE | ZIP | CURRENT_DEMERITS | ... | RECORD_UPDATED | LAT_LONG_RAW | FIRST_VIOLATION | SECOND_VIOLATION | THIRD_VIOLATION | FIRST_VIOLATION_TYPE | SECOND_VIOLATION_TYPE | THIRD_VIOLATION_TYPE | NUMBER_OF_VIOLATIONS | NEXT_INSPECTION_GRADE_C_OR_BELOW | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 11024 | DA0832198 | PR0023618 | Jimmy Johns | JIMMY JOHNS | Restaurant | 310 W LAKE MEAD Pkwy | HendeSON | Nevada | 89015 | 5.0 | ... | 2/21/2013 22:26 | (36.0373812, 114.9923788) | 214.0 | 218.0 | 226.0 | Major | Major | Non-Major | 5 | 0 |
1 rows × 28 columns
df.at[11024,'CITY']='Henderson'
df.loc[11024] #the city is fixed
RESTAURANT_SERIAL_NUMBER DA0832198 RESTAURANT_PERMIT_NUMBER PR0023618 RESTAURANT_NAME Jimmy Johns RESTAURANT_LOCATION JIMMY JOHNS RESTAURANT_CATEGORY Restaurant ADDRESS 310 W LAKE MEAD Pkwy CITY Henderson STATE Nevada ZIP 89015 CURRENT_DEMERITS 5 CURRENT_GRADE A EMPLOYEE_COUNT 19 MEDIAN_EMPLOYEE_AGE 32.0575 MEDIAN_EMPLOYEE_TENURE 3.91526 INSPECTION_TIME 7/22/2011 11:55 INSPECTION_TYPE Routine Inspection INSPECTION_DEMERITS 9 VIOLATIONS_RAW 214,218,226,231,233 RECORD_UPDATED 2/21/2013 22:26 LAT_LONG_RAW (36.0373812, 114.9923788) FIRST_VIOLATION 214 SECOND_VIOLATION 218 THIRD_VIOLATION 226 FIRST_VIOLATION_TYPE Major SECOND_VIOLATION_TYPE Major THIRD_VIOLATION_TYPE Non-Major NUMBER_OF_VIOLATIONS 5 NEXT_INSPECTION_GRADE_C_OR_BELOW 0 Name: 11024, dtype: object
#double confirmation since im paranoid
df['CITY'].unique() #HendeSON is gone
array(['Las Vegas', 'North Las Vegas', 'Henderson', 'Boulder City',
'Mesquite', 'Logandale', 'Primm', 'Laughlin', 'Searchlight',
'Overton', 'Moapa', 'Indian Springs', 'Jean', 'Blue Diamond',
'Sandy Valley', 'Goodsprings'], dtype=object)
df.columns
Index(['RESTAURANT_SERIAL_NUMBER', 'RESTAURANT_PERMIT_NUMBER',
'RESTAURANT_NAME', 'RESTAURANT_LOCATION', 'RESTAURANT_CATEGORY',
'ADDRESS', 'CITY', 'STATE', 'ZIP', 'CURRENT_DEMERITS', 'CURRENT_GRADE',
'EMPLOYEE_COUNT', 'MEDIAN_EMPLOYEE_AGE', 'MEDIAN_EMPLOYEE_TENURE',
'INSPECTION_TIME', 'INSPECTION_TYPE', 'INSPECTION_DEMERITS',
'VIOLATIONS_RAW', 'RECORD_UPDATED', 'LAT_LONG_RAW', 'FIRST_VIOLATION',
'SECOND_VIOLATION', 'THIRD_VIOLATION', 'FIRST_VIOLATION_TYPE',
'SECOND_VIOLATION_TYPE', 'THIRD_VIOLATION_TYPE', 'NUMBER_OF_VIOLATIONS',
'NEXT_INSPECTION_GRADE_C_OR_BELOW'],
dtype='object')
SLOWLY BUT SURELY MAKE MY WAY DOWN EVERY FIELD AND CHECK FOR IRREGULARITIES
#if every restaurant serial number is different the unique values should == the length of the df
len(df['RESTAURANT_SERIAL_NUMBER'].unique()) == len(df)
True
len(df['RESTAURANT_PERMIT_NUMBER'].unique()) == len(df)
False
df['RESTAURANT_PERMIT_NUMBER'].unique() #my first guess is that chain restaurants like mcdonalds possibly have the same permit number
array(['PR0015443', 'PR0021552', 'PR0023727', ..., 'PR0021318',
'PR0004169', 'PR0011660'], dtype=object)
duplicateRowsDF = df[df.duplicated(['RESTAURANT_PERMIT_NUMBER'],keep='first')]
duplicateRowsDF #there seems to be alot of duplicated values
| RESTAURANT_SERIAL_NUMBER | RESTAURANT_PERMIT_NUMBER | RESTAURANT_NAME | RESTAURANT_LOCATION | RESTAURANT_CATEGORY | ADDRESS | CITY | STATE | ZIP | CURRENT_DEMERITS | ... | RECORD_UPDATED | LAT_LONG_RAW | FIRST_VIOLATION | SECOND_VIOLATION | THIRD_VIOLATION | FIRST_VIOLATION_TYPE | SECOND_VIOLATION_TYPE | THIRD_VIOLATION_TYPE | NUMBER_OF_VIOLATIONS | NEXT_INSPECTION_GRADE_C_OR_BELOW | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1978 | DA0917165 | PR0013351 | Japan Cafe | Japan Cafe | Restaurant | 755 S Grand Central Pkwy | Las Vegas | Nevada | 89106-4524 | 9.0 | ... | 2/21/2013 22:26 | (36.1652351, 115.1557538) | 211.0 | 212.0 | 215.0 | Major | Major | Major | 6 | 1 |
| 12726 | DA1038289 | PR0017360 | China Olive Restaurant | China Olive | Restaurant | 151 N Nellis Blvd | Las Vegas | Nevada | 89110-5320 | 14.0 | ... | 2/21/2013 22:26 | (36.1627045, 115.0623846) | 203.0 | 206.0 | 211.0 | Critical | Critical | Major | 10 | 1 |
| 6938 | DA0920814 | PR0023727 | HOOKAH PALACE BAR | HOOKAH PALACE | Bar / Tavern | 1030 E Flamingo Rd | Las Vegas | Nevada | 89119-7436 | 0.0 | ... | 2/21/2013 22:26 | (36.1155934, 115.1414363) | 222.0 | 223.0 | 227.0 | Major | Major | Non-Major | 5 | 1 |
| 1971 | DA1039083 | PR0021760 | Baja Fresh #73 | Baja Fresh #73 | Restaurant | 1380 E Flamingo Rd | Las Vegas | Nevada | 89119-5252 | 6.0 | ... | 2/21/2013 22:26 | (36.1146707, 115.1355152) | 211.0 | 214.0 | 215.0 | Major | Major | Major | 11 | 0 |
| 5250 | DA1039128 | PR0101085 | BUBBA GUMP BAR | BUBBA GUMP SHRIMP CO @ HARMON CENTER | Bar / Tavern | 3717 S LAS VEGAS Blvd 2Q/3C | Las Vegas | Nevada | 89109 | 8.0 | ... | 2/21/2013 22:26 | (36.108719, 115.1721291) | 202.0 | 204.0 | 212.0 | Critical | Critical | Major | 4 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7087 | DAF0FSQLZ | PR0019038 | Mt Everest Indias Cuisine | Mt Everest Indias Cuisine | Restaurant | 3641 W Sahara Ave | Las Vegas | Nevada | 89102-5870 | 9.0 | ... | 9/9/2016 12:47 | (36.144199, 115.18992) | 202.0 | 208.0 | 209.0 | Critical | Critical | Critical | 13 | 1 |
| 194 | DA0ODTZQF | PR0018495 | Teriyaki Madness | Teriyaki Madness | Restaurant | 725 W Craig Rd | North Las Vegas | Nevada | 89032-6199 | 9.0 | ... | 9/12/2016 9:59 | (36.239207, 115.153386) | 212.0 | 213.0 | 215.0 | Major | Major | Major | 3 | 0 |
| 15302 | DAK7RVZ6O | PR0114198 | SMASHED PIG - RESTAURANT | SMASHED PIG GASTROPUB, THE | Restaurant | 509 FREMONT | Las Vegas | Nevada | 89101 | 8.0 | ... | 9/9/2016 16:05 | (36.1692635, 115.1406902) | 202.0 | 208.0 | 211.0 | Critical | Critical | Major | 10 | 0 |
| 4180 | DAX8DYZ7Y | PR0021500 | Afandi Restaurant | Afandi Restaurant | Restaurant | 5181 W Charleston Blvd | Las Vegas | Nevada | 89146-1431 | 8.0 | ... | 9/9/2016 16:26 | (36.1587543, 115.2095993) | 208.0 | 209.0 | 216.0 | Critical | Critical | Major | 8 | 0 |
| 6802 | DAYQX02H2 | PR0010829 | MCDONALDS #17690 | MCDONALDS #17690 | Restaurant | 6020 W Tropicana Ave | Las Vegas | Nevada | 89103-4800 | 6.0 | ... | 9/9/2016 10:43 | (36.100462, 115.224075) | 202.0 | 209.0 | 211.0 | Critical | Critical | Major | 6 | 0 |
4660 rows × 28 columns
#in general how many duplicate values almost half mmmh
test=(df[df['RESTAURANT_PERMIT_NUMBER']=='PR0013351'])
test.INSPECTION_TIME
6352 1/10/2012 14:50 1978 1/11/2012 9:15 11814 5/21/2010 11:20 Name: INSPECTION_TIME, dtype: object
MOVING ON TO THE NEXT COL
len(df['RESTAURANT_NAME'].unique()) #may have to transform data so mcdonalds and mac donalds would be the same thing.....if it comes down to it
7143
df['RESTAURANT_CATEGORY'].unique() #ok all these seem food related
array(['Restaurant', 'Bar / Tavern', 'Buffet', 'Special Kitchen',
'Snack Bar', 'Bakery Sales', 'Confection',
'Food Trucks / Mobile Vendor', 'Caterer', 'Portable Unit',
'Kitchen Bakery', 'Portable Bar', 'Childcare Kitchens',
'Banquet Kitchen', 'Pantry', 'Barbeque', 'Banquet Support',
'Garde Manger', 'Vegetable Prep', 'Grocery Store Sampling',
'Elementary School Kitchen', 'Institutional Food Service',
'Meat/Poultry/Seafood', 'Concessions', 'Main Kitchen',
'Produce Market', 'Farmers Market', 'Self-Service Food Truck'],
dtype=object)
ZIP = df['ZIP'].unique()
ZIP #need to check if everything in this array starts with the digit 8
res = all(str(i)[0] == str(ZIP[0])[0] for i in ZIP)
res #there is a zipcode that does not start with 8
False
df['ZIP'] = df['ZIP'].str[:5]
df['ZIP'].unique() #91301 is a california Area code
array(['89113', '89103', '89119', '89142', '89135', '89101', '89030',
'89109', '89002', '89115', '89106', '89104', '89148', '89111',
'89032', '89169', '89123', '89074', '89146', '89110', '89130',
'89005', '89015', '89147', '89120', '89121', '89102', '89139',
'89027', '89052', '89014', '89149', '89118', '89012', '89189',
'89183', '89021', '89019', '89179', '89131', '89117', '89145',
'89029', '89128', '89084', '89031', '89129', '89141', '89081',
'89107', '89011', '89134', '89108', '89156', '89158', '89046',
'89122', '89127', '89140', '89044', '89114', '89040', '89025',
'89143', '89124', '89086', '89157', '89178', '89138', '89036',
'89163', '89018', '89161', '89144', '89034', '89004', '89186',
'89165', '89105', '89085', '89153', '91301', '89166', '89159',
'89154'], dtype=object)
df[df['ZIP']=='91301']
#https://www.yelp.com/biz/the-cheesecake-factory-las-vegas-las-vegas
#this place exists but should have the zip '89106'
| RESTAURANT_SERIAL_NUMBER | RESTAURANT_PERMIT_NUMBER | RESTAURANT_NAME | RESTAURANT_LOCATION | RESTAURANT_CATEGORY | ADDRESS | CITY | STATE | ZIP | CURRENT_DEMERITS | ... | RECORD_UPDATED | LAT_LONG_RAW | FIRST_VIOLATION | SECOND_VIOLATION | THIRD_VIOLATION | FIRST_VIOLATION_TYPE | SECOND_VIOLATION_TYPE | THIRD_VIOLATION_TYPE | NUMBER_OF_VIOLATIONS | NEXT_INSPECTION_GRADE_C_OR_BELOW | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6454 | DADC0NAML | PR0113304 | CHEESECAKE FACTORY COOKLINE | CHEESECAKE FACTORY @ PREMIUM OUTLET MALL | Special Kitchen | 505 GRAND CENTRAL 3201 | Las Vegas | Nevada | 91301 | 6.0 | ... | 6/29/2016 15:13 | (36.1661416, 115.1564436) | 202.0 | 209.0 | 211.0 | Critical | Critical | Major | 5 | 0 |
| 3499 | DA1522195 | PR0113303 | CHEESECAKE FACTORY RESTAURANT | CHEESECAKE FACTORY @ PREMIUM OUTLET MALL | Restaurant | 505 GRAND CENTRAL 3201 | Las Vegas | Nevada | 91301 | 5.0 | ... | 6/12/2015 15:36 | (36.1661416, 115.1564436) | 202.0 | 211.0 | 215.0 | Critical | Major | Major | 6 | 0 |
2 rows × 28 columns
df.at[6454,'ZIP']='89106'
df.at[3499,'ZIP']='89106'
df[df['ZIP']=='91301'] #SHOULD COME UP EMPTY
res = all(str(i)[0] == str(df.ZIP[0])[0] for i in df.ZIP)
res #
True
ONTO THE NEXT FIELD
all(isinstance(x, (int, float)) for x in df['CURRENT_DEMERITS'])
#CURRENT_DEMERITS seems to be ok
True
df.CURRENT_GRADE.unique() #there are alot of grades that dont exist here
#The Restaurant Inspections from the Southern Nevada Health District (SNHD) uses the following grades; A,B,C,X
#However, the only grades used by food grading inspectors are A-C, https://www1.nyc.gov/site/doh/services/restaurant-grades.page
array(['A', 'X', 'O', 'B', 'N', 'C', 'A+', 'K', '.\\<><1@#&|'],
dtype=object)
#df[df['CURRENT_GRADE']=='.\\<><1@#&|']
#df[df['CURRENT_GRADE']=='K']
df[df['CURRENT_GRADE']=='A+']
| RESTAURANT_SERIAL_NUMBER | RESTAURANT_PERMIT_NUMBER | RESTAURANT_NAME | RESTAURANT_LOCATION | RESTAURANT_CATEGORY | ADDRESS | CITY | STATE | ZIP | CURRENT_DEMERITS | ... | RECORD_UPDATED | LAT_LONG_RAW | FIRST_VIOLATION | SECOND_VIOLATION | THIRD_VIOLATION | FIRST_VIOLATION_TYPE | SECOND_VIOLATION_TYPE | THIRD_VIOLATION_TYPE | NUMBER_OF_VIOLATIONS | NEXT_INSPECTION_GRADE_C_OR_BELOW | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1900 | DA0575300 | PR0017094 | Dairy Queen #17798 | Dairy Queen #17798 | Restaurant | 6125 Spring Mountain Rd | Las Vegas | Nevada | 89146 | 10.0 | ... | 2/21/2013 22:26 | (36.1250311, 115.2256202) | 9.0 | 14.0 | 19.0 | Non-Major | Kitchen Nightmares | Non-Major | 4 | 0 |
1 rows × 28 columns
df.at[1900,'CURRENT_GRADE']='A'
keepVALUES = ['A','B','C']
boolean_series = df['CURRENT_GRADE'].isin(keepVALUES)
filtered_df = df[boolean_series]
filtered_df=df
df.at[10034,'STATE']='Nevada'
df.at[13253,'STATE']='Nevada'
NEXT FIELD, EMPLOYEE_COUNT
all(isinstance(x, (int, float)) for x in df['EMPLOYEE_COUNT'])
True
NEXT FIELD, MEDIAN_EMPLOYEE_AGE
all(isinstance(x, (int, float)) for x in df['MEDIAN_EMPLOYEE_AGE'])
True
NEXT FIELD, MEDIAN_EMPLOYEE_TENURE
all(isinstance(x, (int, float)) for x in df['MEDIAN_EMPLOYEE_TENURE'])
True
NEXT FIELD, INSPECTION_TIME
#df.INSPECTION_TIME
#there will be wrong time, the best idea i have is to filter the df by choronologicall order and remove the obvious outliers
first_column = df.pop('INSPECTION_TIME')
df.insert(0, 'INSPECTION_TIME', first_column)
df = df.sort_values(by='INSPECTION_TIME')
df = df.iloc[1: , :]
#the first row was inspected in 1900 which didnt take place I will have deleted this row
NEXT FIELD, INSPECTION_DEMERITS
all(isinstance(x, (int, float)) for x in df['INSPECTION_DEMERITS'])
False
df.INSPECTION_DEMERITS.unique() # PAY ATTENTION TO 'Routine Inspection'
df[df.INSPECTION_DEMERITS=='Routine Inspection'] #this record also has outlier for record_updated should be safe to del
| INSPECTION_TIME | RESTAURANT_SERIAL_NUMBER | RESTAURANT_PERMIT_NUMBER | RESTAURANT_NAME | RESTAURANT_LOCATION | RESTAURANT_CATEGORY | ADDRESS | CITY | STATE | ZIP | ... | RECORD_UPDATED | LAT_LONG_RAW | FIRST_VIOLATION | SECOND_VIOLATION | THIRD_VIOLATION | FIRST_VIOLATION_TYPE | SECOND_VIOLATION_TYPE | THIRD_VIOLATION_TYPE | NUMBER_OF_VIOLATIONS | NEXT_INSPECTION_GRADE_C_OR_BELOW | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 13851 | 9.708333896 | DA0872282 | PR0021555 | Quiznos Sub #1397 | Quiznos Sub | Restaurant | 3318 E Flamingo Rd | Las Vegas | Nevada | 89121 | ... | 1/14/1900 0:00 | (36.11539, 115.102781) | 209.0 | 213.0 | 217.0 | Critical | Major | Major | 5 | 0 |
1 rows × 28 columns
df = df.drop(13851)
#df
NEXT FIELD, VIOLATIONS_RAW
all(isinstance(x, (int, float)) for x in df['VIOLATIONS_RAW'])
#all(isinstance(x, (int, float)) for x in df['VIOLATIONS_RAW']) is true meaning there is a string somewhere in data
False
df2=[]
for each in df.VIOLATIONS_RAW:
string_list = each.split(",")
each = list(filter(lambda string: string != '000', string_list))
a =each
df2.append(a)
df.VIOLATIONS_RAW = df2
test2=[]
for each in df.VIOLATIONS_RAW:
a=(any(c.isalpha() for c in each))
test2.append(a)
[i for i, x in enumerate(test2) if x]
#there is no true values, but there are 6 blanks
[]
[i for i, item in enumerate(df.VIOLATIONS_RAW) if item is None]
[]
[i for i, item in enumerate(df.VIOLATIONS_RAW) if item is '']
[]
[i for i, item in enumerate(df.VIOLATIONS_RAW) if item is '[]']
[]
NEXT FIELD, RECORD_UPDATED
#df3 = df.sort_values(by='RECORD_UPDATED') record updated seems to be fine
NEXT FIELD, LAT_LONG_RAW
FIRST_VIOLATION + SECOND_VIOLATION + THIRD_VIOLATION
print(all(isinstance(x, (int, float)) for x in df['FIRST_VIOLATION']))
print(all(isinstance(x, (int, float)) for x in df['SECOND_VIOLATION']))
print(all(isinstance(x, (int, float)) for x in df['THIRD_VIOLATION']))
True True True
NEXT FIELD, FIRST_VIOLATION_TYPE + SECOND_VIOLATION_TYPE + THIRD_VIOLATION_TYPE
print(df.FIRST_VIOLATION_TYPE.unique() )
print('-')
print(df.SECOND_VIOLATION_TYPE.unique() )
#im not sure Kitchen Nightmares and Supercritical are violation types if there is only 1 row I will delete it
print('-')
print(df.THIRD_VIOLATION_TYPE.unique() )
['Critical' 'Major' 'Non-Major' 'Imminent Health Hazard'] - ['Major' 'Critical' 'Non-Major' 'Imminent Health Hazard' 'Kitchen Nightmares' 'Supercritical'] - ['Major' 'Critical' 'Non-Major' 'Imminent Health Hazard']
df[df.SECOND_VIOLATION_TYPE=='Kitchen Nightmares']
| INSPECTION_TIME | RESTAURANT_SERIAL_NUMBER | RESTAURANT_PERMIT_NUMBER | RESTAURANT_NAME | RESTAURANT_LOCATION | RESTAURANT_CATEGORY | ADDRESS | CITY | STATE | ZIP | ... | RECORD_UPDATED | LAT_LONG_RAW | FIRST_VIOLATION | SECOND_VIOLATION | THIRD_VIOLATION | FIRST_VIOLATION_TYPE | SECOND_VIOLATION_TYPE | THIRD_VIOLATION_TYPE | NUMBER_OF_VIOLATIONS | NEXT_INSPECTION_GRADE_C_OR_BELOW | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1900 | 3/2/2010 15:30 | DA0575300 | PR0017094 | Dairy Queen #17798 | Dairy Queen #17798 | Restaurant | 6125 Spring Mountain Rd | Las Vegas | Nevada | 89146 | ... | 2/21/2013 22:26 | (36.1250311, 115.2256202) | 9.0 | 14.0 | 19.0 | Non-Major | Kitchen Nightmares | Non-Major | 4 | 0 |
1 rows × 28 columns
df[df.SECOND_VIOLATION_TYPE=='Supercritical']
| INSPECTION_TIME | RESTAURANT_SERIAL_NUMBER | RESTAURANT_PERMIT_NUMBER | RESTAURANT_NAME | RESTAURANT_LOCATION | RESTAURANT_CATEGORY | ADDRESS | CITY | STATE | ZIP | ... | RECORD_UPDATED | LAT_LONG_RAW | FIRST_VIOLATION | SECOND_VIOLATION | THIRD_VIOLATION | FIRST_VIOLATION_TYPE | SECOND_VIOLATION_TYPE | THIRD_VIOLATION_TYPE | NUMBER_OF_VIOLATIONS | NEXT_INSPECTION_GRADE_C_OR_BELOW | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 307 | 4/20/2010 14:45 | DA0305743 | PR0007180 | Jack in the Box #7264 | Jack in the Box #7264 | Restaurant | 6345 Losee Rd | North Las Vegas | Nevada | 89081 | ... | 2/21/2013 22:26 | (10.6918, 61.2225) | 4.0 | 12.0 | 19.0 | Major | Supercritical | Non-Major | 8 | 0 |
1 rows × 28 columns
df.at[1900,'SECOND_VIOLATION_TYPE']='Imminent Health Hazard'
df.at[307,'SECOND_VIOLATION_TYPE']='Imminent Health Hazard'
#changing the crazy outliers to fit situation better
print(df.SECOND_VIOLATION_TYPE.unique() )
#test to see if changes took place
['Major' 'Critical' 'Non-Major' 'Imminent Health Hazard']
NEXT FIELD, NUMBER_OF_VIOLATIONS
print(all(isinstance(x, (int)) for x in df['NUMBER_OF_VIOLATIONS']))
print(df.NUMBER_OF_VIOLATIONS.unique() )
False ['10' '8' '7' '20' '6' '4' '3' '23' '5' '9' '13' '18' '11' '14' '15' '12' '16' '17' '30' '25' '19' '28' '42' '21' '22' '24']
NEXT FIELD, NEXT_INSPECTION_GRADE_C_OR_BELOW
print(df.NEXT_INSPECTION_GRADE_C_OR_BELOW.unique() )
#the correct outputs should be 0 and 1
['0' '1' '7' '-3' 'Goat']
df[df.NEXT_INSPECTION_GRADE_C_OR_BELOW=='Goat']
| INSPECTION_TIME | RESTAURANT_SERIAL_NUMBER | RESTAURANT_PERMIT_NUMBER | RESTAURANT_NAME | RESTAURANT_LOCATION | RESTAURANT_CATEGORY | ADDRESS | CITY | STATE | ZIP | ... | RECORD_UPDATED | LAT_LONG_RAW | FIRST_VIOLATION | SECOND_VIOLATION | THIRD_VIOLATION | FIRST_VIOLATION_TYPE | SECOND_VIOLATION_TYPE | THIRD_VIOLATION_TYPE | NUMBER_OF_VIOLATIONS | NEXT_INSPECTION_GRADE_C_OR_BELOW | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1622 | 6/17/2010 8:20 | DA0508771 | PR0014513 | Plaza Hotel Downtown Grill | Plaza Hotel & Casino | Snack Bar | 1 S Main St | Las Vegas | Nevada | 89101 | ... | 2/21/2013 22:26 | (36.1720776, 115.1468298) | 4.0 | 14.0 | 27.0 | Major | Major | Non-Major | 4 | Goat |
1 rows × 28 columns
df[df.NEXT_INSPECTION_GRADE_C_OR_BELOW=='-3']
| INSPECTION_TIME | RESTAURANT_SERIAL_NUMBER | RESTAURANT_PERMIT_NUMBER | RESTAURANT_NAME | RESTAURANT_LOCATION | RESTAURANT_CATEGORY | ADDRESS | CITY | STATE | ZIP | ... | RECORD_UPDATED | LAT_LONG_RAW | FIRST_VIOLATION | SECOND_VIOLATION | THIRD_VIOLATION | FIRST_VIOLATION_TYPE | SECOND_VIOLATION_TYPE | THIRD_VIOLATION_TYPE | NUMBER_OF_VIOLATIONS | NEXT_INSPECTION_GRADE_C_OR_BELOW | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 546 | 4/15/2010 12:15 | DA0630686 | PR0023750 | Pizza Hut #026623 | Pizza Hut | Restaurant | 10260 W Charleston Blvd | Las Vegas | Nevada | 89135 | ... | 2/21/2013 22:26 | (36.159506, 115.319453) | 31.0 | 32.0 | 112.0 | Non-Major | Non-Major | Major | 4 | -3 |
1 rows × 28 columns
df[df.NEXT_INSPECTION_GRADE_C_OR_BELOW=='7']
| INSPECTION_TIME | RESTAURANT_SERIAL_NUMBER | RESTAURANT_PERMIT_NUMBER | RESTAURANT_NAME | RESTAURANT_LOCATION | RESTAURANT_CATEGORY | ADDRESS | CITY | STATE | ZIP | ... | RECORD_UPDATED | LAT_LONG_RAW | FIRST_VIOLATION | SECOND_VIOLATION | THIRD_VIOLATION | FIRST_VIOLATION_TYPE | SECOND_VIOLATION_TYPE | THIRD_VIOLATION_TYPE | NUMBER_OF_VIOLATIONS | NEXT_INSPECTION_GRADE_C_OR_BELOW | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4885 | 2/4/2013 14:15 | DA1043447 | PR0021555 | Quiznos Sub #1397 | Quiznos Sub | Restaurant | 3318 E Flamingo Rd | Las Vegas | Nevada | 89121 | ... | 2/21/2013 22:26 | (36.11539, 115.102781) | 202.0 | 209.0 | 211.0 | Critical | Critical | Major | 10 | 7 |
1 rows × 28 columns
KEEPING IN MIND INDEX 1622, 546, 4885
df = df.drop(1622)
df[df.NEXT_INSPECTION_GRADE_C_OR_BELOW=='Goat']
df = df.drop(546)
df[df.NEXT_INSPECTION_GRADE_C_OR_BELOW=='-3']
df = df.drop(4885)
print(df.NEXT_INSPECTION_GRADE_C_OR_BELOW.unique() )
['0' '1']
END OF PRELIMINARY CLEANING, NOW OFF TO EDA
df.describe()
| CURRENT_DEMERITS | EMPLOYEE_COUNT | MEDIAN_EMPLOYEE_AGE | MEDIAN_EMPLOYEE_TENURE | FIRST_VIOLATION | SECOND_VIOLATION | THIRD_VIOLATION | |
|---|---|---|---|---|---|---|---|
| count | 12060.000000 | 12060.000000 | 12060.000000 | 12060.000000 | 12060.000000 | 12060.000000 | 12060.000000 |
| mean | 5.037811 | 24.331095 | 28.032925 | 4.039101 | 165.102156 | 171.846269 | 179.234411 |
| std | 5.165707 | 1014.753892 | 4.878642 | 2.016693 | 81.412748 | 80.147731 | 77.450620 |
| min | 0.000000 | -7.000000 | 18.000000 | 0.227197 | 1.000000 | 2.000000 | 4.000000 |
| 25% | 0.000000 | 8.000000 | 24.650072 | 2.556196 | 202.000000 | 204.000000 | 209.000000 |
| 50% | 5.000000 | 14.000000 | 28.002760 | 3.704378 | 206.000000 | 211.000000 | 215.000000 |
| 75% | 8.000000 | 21.000000 | 31.426535 | 5.163998 | 211.000000 | 215.000000 | 227.000000 |
| max | 100.000000 | 111447.000000 | 49.614076 | 16.433564 | 301.000000 | 302.000000 | 310.000000 |
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 12060 entries, 7989 to 6802 Data columns (total 28 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 INSPECTION_TIME 12060 non-null object 1 RESTAURANT_SERIAL_NUMBER 12060 non-null object 2 RESTAURANT_PERMIT_NUMBER 12060 non-null object 3 RESTAURANT_NAME 12060 non-null object 4 RESTAURANT_LOCATION 12060 non-null object 5 RESTAURANT_CATEGORY 12060 non-null object 6 ADDRESS 12060 non-null object 7 CITY 12060 non-null object 8 STATE 12060 non-null object 9 ZIP 12060 non-null object 10 CURRENT_DEMERITS 12060 non-null float64 11 CURRENT_GRADE 12060 non-null object 12 EMPLOYEE_COUNT 12060 non-null float64 13 MEDIAN_EMPLOYEE_AGE 12060 non-null float64 14 MEDIAN_EMPLOYEE_TENURE 12060 non-null float64 15 INSPECTION_TYPE 12060 non-null object 16 INSPECTION_DEMERITS 12060 non-null object 17 VIOLATIONS_RAW 12060 non-null object 18 RECORD_UPDATED 12060 non-null object 19 LAT_LONG_RAW 12060 non-null object 20 FIRST_VIOLATION 12060 non-null float64 21 SECOND_VIOLATION 12060 non-null float64 22 THIRD_VIOLATION 12060 non-null float64 23 FIRST_VIOLATION_TYPE 12060 non-null object 24 SECOND_VIOLATION_TYPE 12060 non-null object 25 THIRD_VIOLATION_TYPE 12060 non-null object 26 NUMBER_OF_VIOLATIONS 12060 non-null object 27 NEXT_INSPECTION_GRADE_C_OR_BELOW 12060 non-null object dtypes: float64(7), object(21) memory usage: 2.7+ MB
df=df.reset_index(drop='true')
df
| INSPECTION_TIME | RESTAURANT_SERIAL_NUMBER | RESTAURANT_PERMIT_NUMBER | RESTAURANT_NAME | RESTAURANT_LOCATION | RESTAURANT_CATEGORY | ADDRESS | CITY | STATE | ZIP | ... | RECORD_UPDATED | LAT_LONG_RAW | FIRST_VIOLATION | SECOND_VIOLATION | THIRD_VIOLATION | FIRST_VIOLATION_TYPE | SECOND_VIOLATION_TYPE | THIRD_VIOLATION_TYPE | NUMBER_OF_VIOLATIONS | NEXT_INSPECTION_GRADE_C_OR_BELOW | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1/10/2012 10:25 | DA0916254 | PR0010827 | MCDONALDS #23038 | MCDONALDS #23038 | Restaurant | 5811 E Charleston Blvd | Las Vegas | Nevada | 89142 | ... | 2/21/2013 22:26 | (36.1589722, 115.0469673) | 202.0 | 211.0 | 215.0 | Critical | Major | Major | 10 | 0 |
| 1 | 1/10/2012 11:30 | DA0915710 | PR0021625 | Bears Best Las Vegas Restaurant | Bears Best Las Vegas Lp | Restaurant | 11111 W Flamingo Rd | Las Vegas | Nevada | 89135 | ... | 2/21/2013 22:26 | (36.120164, 115.335163) | 209.0 | 214.0 | 215.0 | Critical | Major | Major | 8 | 0 |
| 2 | 1/10/2012 12:10 | DA0916074 | PR0021103 | MARISCOS EL MARIACHI | MARISCOS EL MARIACHI | Restaurant | 552 N Eastern Ave A | Las Vegas | Nevada | 89101 | ... | 2/21/2013 22:26 | (36.1723554, 115.1160331) | 202.0 | 209.0 | 214.0 | Critical | Critical | Major | 7 | 0 |
| 3 | 1/10/2012 12:30 | DA0915660 | PR0016470 | New China Buffet | New China Buffet | Buffet | 1510 E Flamingo Rd | Las Vegas | Nevada | 89119 | ... | 2/21/2013 22:26 | (36.115612, 115.13276) | 202.0 | 206.0 | 209.0 | Critical | Critical | Critical | 20 | 1 |
| 4 | 1/10/2012 12:55 | DA0916183 | PR0017676 | Opera House Restaurant | Opera House | Restaurant | 2542 N Las Vegas Blvd | North Las Vegas | Nevada | 89030 | ... | 2/21/2013 22:26 | (36.2057231, 115.1129594) | 213.0 | 222.0 | 225.0 | Major | Major | Non-Major | 6 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 12055 | 9/9/2016 12:25 | DA0ODTZQF | PR0018495 | Teriyaki Madness | Teriyaki Madness | Restaurant | 725 W Craig Rd | North Las Vegas | Nevada | 89032 | ... | 9/12/2016 9:59 | (36.239207, 115.153386) | 212.0 | 213.0 | 215.0 | Major | Major | Major | 3 | 0 |
| 12056 | 9/9/2016 15:10 | DAK7RVZ6O | PR0114198 | SMASHED PIG - RESTAURANT | SMASHED PIG GASTROPUB, THE | Restaurant | 509 FREMONT | Las Vegas | Nevada | 89101 | ... | 9/9/2016 16:05 | (36.1692635, 115.1406902) | 202.0 | 208.0 | 211.0 | Critical | Critical | Major | 10 | 0 |
| 12057 | 9/9/2016 8:00 | DAEOJHXFM | PR0011660 | SOUTH STRIP TRAVELODGE CONT BREAKFAST | SOUTH STRIP TRAVELODGE | Snack Bar | 3735 S Las Vegas Blvd | Las Vegas | Nevada | 89109 | ... | 9/9/2016 9:57 | (36.107315, 115.170976) | 212.0 | 213.0 | 215.0 | Major | Major | Major | 8 | 0 |
| 12058 | 9/9/2016 9:00 | DAX8DYZ7Y | PR0021500 | Afandi Restaurant | Afandi Restaurant | Restaurant | 5181 W Charleston Blvd | Las Vegas | Nevada | 89146 | ... | 9/9/2016 16:26 | (36.1587543, 115.2095993) | 208.0 | 209.0 | 216.0 | Critical | Critical | Major | 8 | 0 |
| 12059 | 9/9/2016 9:45 | DAYQX02H2 | PR0010829 | MCDONALDS #17690 | MCDONALDS #17690 | Restaurant | 6020 W Tropicana Ave | Las Vegas | Nevada | 89103 | ... | 9/9/2016 10:43 | (36.100462, 115.224075) | 202.0 | 209.0 | 211.0 | Critical | Critical | Major | 6 | 0 |
12060 rows × 28 columns
City Restaurant Type Analysis
df['year']=df['INSPECTION_TIME'].apply(lambda x:x.split('/')[0])
df['month']=df['INSPECTION_TIME'].apply(lambda x:x.split('/')[1])
df['day']=df['INSPECTION_TIME'].apply(lambda x:x.split('/')[2].split('T')[0])
#df.to_csv('TRAIN2.CSV',index=False) taking the train data to Tableau for EDA
# compression_opts = dict(method='zip',
# archive_name='TRAIN2.csv')
# df.to_csv('TRAIN2.zip', index=False,
# compression=compression_opts)
df.CURRENT_GRADE.unique() # Sneaky last field
array(['A', 'X', 'O', 'B', 'N', 'C'], dtype=object)
delLIST1=df[df.CURRENT_GRADE=='O'].index
delLIST1
for each in delLIST1:
df = df.drop(each)
df[df.CURRENT_GRADE=='O'] #test, ez loop works
| INSPECTION_TIME | RESTAURANT_SERIAL_NUMBER | RESTAURANT_PERMIT_NUMBER | RESTAURANT_NAME | RESTAURANT_LOCATION | RESTAURANT_CATEGORY | ADDRESS | CITY | STATE | ZIP | ... | SECOND_VIOLATION | THIRD_VIOLATION | FIRST_VIOLATION_TYPE | SECOND_VIOLATION_TYPE | THIRD_VIOLATION_TYPE | NUMBER_OF_VIOLATIONS | NEXT_INSPECTION_GRADE_C_OR_BELOW | year | month | day |
|---|
0 rows × 31 columns
delLIST2=df[df.CURRENT_GRADE=='X'].index
delLIST2
for each in delLIST2:
df = df.drop(each)
delLIST3=df[df.CURRENT_GRADE=='N'].index #realize in the end i could have made a loop and saved 15 lines of code
delLIST3
for each in delLIST3:
df = df.drop(each)
df.CURRENT_GRADE.unique() #final confirmation
array(['A', 'B', 'C'], dtype=object)
df=df.reset_index(drop='true')
df
df.to_csv('TRAIN3.CSV',index=False)# getting data ready for Tableau
#PRE PROCESSING
df.columns
Index(['INSPECTION_TIME', 'RESTAURANT_SERIAL_NUMBER',
'RESTAURANT_PERMIT_NUMBER', 'RESTAURANT_NAME', 'RESTAURANT_LOCATION',
'RESTAURANT_CATEGORY', 'ADDRESS', 'CITY', 'STATE', 'ZIP',
'CURRENT_DEMERITS', 'CURRENT_GRADE', 'EMPLOYEE_COUNT',
'MEDIAN_EMPLOYEE_AGE', 'MEDIAN_EMPLOYEE_TENURE', 'INSPECTION_TYPE',
'INSPECTION_DEMERITS', 'VIOLATIONS_RAW', 'RECORD_UPDATED',
'LAT_LONG_RAW', 'FIRST_VIOLATION', 'SECOND_VIOLATION',
'THIRD_VIOLATION', 'FIRST_VIOLATION_TYPE', 'SECOND_VIOLATION_TYPE',
'THIRD_VIOLATION_TYPE', 'NUMBER_OF_VIOLATIONS',
'NEXT_INSPECTION_GRADE_C_OR_BELOW', 'year', 'month', 'day'],
dtype='object')
def preprocess_inputs(df):
df = df.copy()
# First wave of dropped columns
df = df.drop(['RESTAURANT_SERIAL_NUMBER',
'RESTAURANT_PERMIT_NUMBER', 'RESTAURANT_NAME',
'RESTAURANT_LOCATION','ADDRESS', 'CITY',
'STATE','INSPECTION_TIME','VIOLATIONS_RAW',
'RECORD_UPDATED','LAT_LONG_RAW',], axis=1)
# Convert ZIP into a string column
df['ZIP'] = df['ZIP'].astype(str)
# One-hot encode remaining categorical columns
for each in df.select_dtypes('object').columns.drop('NEXT_INSPECTION_GRADE_C_OR_BELOW'):
dummies = pd.get_dummies(df[each], prefix=each)
df = pd.concat([df, dummies], axis=1)
df = df.drop(each, axis=1)
# Split df into X and y
y = df['NEXT_INSPECTION_GRADE_C_OR_BELOW']
X = df.drop('NEXT_INSPECTION_GRADE_C_OR_BELOW', axis=1)
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)
# Scale X
scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
return X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = preprocess_inputs(df)
X_train
| CURRENT_DEMERITS | EMPLOYEE_COUNT | MEDIAN_EMPLOYEE_AGE | MEDIAN_EMPLOYEE_TENURE | FIRST_VIOLATION | SECOND_VIOLATION | THIRD_VIOLATION | RESTAURANT_CATEGORY_Bakery Sales | RESTAURANT_CATEGORY_Banquet Kitchen | RESTAURANT_CATEGORY_Banquet Support | ... | day_2017 8:55 | day_2017 9:00 | day_2017 9:10 | day_2017 9:15 | day_2017 9:20 | day_2017 9:25 | day_2017 9:30 | day_2017 9:35 | day_2017 9:40 | day_2017 9:45 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8618 | -1.115954 | -0.020962 | 0.300697 | 0.251752 | 0.467637 | 0.428391 | 0.386951 | -0.055815 | -0.070153 | -0.032805 | ... | -0.01093 | -0.015458 | -0.01093 | -0.01093 | -0.01093 | -0.018933 | -0.018933 | 0.0 | -0.01093 | -0.015458 |
| 901 | -0.431084 | -0.001255 | 2.007250 | -0.431320 | 0.455373 | 0.465771 | 0.464324 | -0.055815 | -0.070153 | -0.032805 | ... | -0.01093 | -0.015458 | -0.01093 | -0.01093 | -0.01093 | -0.018933 | -0.018933 | 0.0 | -0.01093 | -0.015458 |
| 6940 | -1.115954 | -0.003718 | 0.270280 | -0.567681 | 0.504431 | 0.528070 | 0.657756 | -0.055815 | -0.070153 | -0.032805 | ... | -0.01093 | -0.015458 | -0.01093 | -0.01093 | -0.01093 | -0.018933 | -0.018933 | 0.0 | -0.01093 | -0.015458 |
| 11462 | 0.253786 | -0.020141 | -1.064477 | -0.083892 | 0.455373 | 0.503150 | 0.464324 | -0.055815 | -0.070153 | -0.032805 | ... | -0.01093 | -0.015458 | -0.01093 | -0.01093 | -0.01093 | -0.018933 | -0.018933 | 0.0 | -0.01093 | -0.015458 |
| 4837 | 0.710366 | -0.020962 | 0.166770 | -1.334293 | -1.715422 | -1.789446 | -1.843961 | -0.055815 | -0.070153 | -0.032805 | ... | -0.01093 | -0.015458 | -0.01093 | -0.01093 | -0.01093 | -0.018933 | -0.018933 | 0.0 | -0.01093 | -0.015458 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7813 | -1.115954 | -0.015214 | 0.929464 | -0.675117 | -1.874859 | -1.851745 | -1.947124 | -0.055815 | -0.070153 | -0.032805 | ... | -0.01093 | -0.015458 | -0.01093 | -0.01093 | -0.01093 | -0.018933 | -0.018933 | 0.0 | -0.01093 | -0.015458 |
| 10955 | -1.115954 | -0.020962 | 2.251192 | -0.387718 | -1.972974 | -1.963883 | -1.908438 | -0.055815 | -0.070153 | -0.032805 | ... | -0.01093 | -0.015458 | -0.01093 | -0.01093 | -0.01093 | -0.018933 | -0.018933 | 0.0 | -0.01093 | -0.015458 |
| 905 | 0.938656 | -0.013572 | 1.233989 | -0.582226 | 0.455373 | 0.403472 | 0.386951 | -0.055815 | -0.070153 | -0.032805 | ... | -0.01093 | -0.015458 | -0.01093 | -0.01093 | -0.01093 | -0.018933 | -0.018933 | 0.0 | -0.01093 | -0.015458 |
| 5192 | -0.431084 | -0.015214 | -0.027823 | -1.528446 | 0.590281 | 0.528070 | 0.696442 | -0.055815 | -0.070153 | -0.032805 | ... | -0.01093 | -0.015458 | -0.01093 | -0.01093 | -0.01093 | -0.018933 | -0.018933 | 0.0 | -0.01093 | -0.015458 |
| 235 | -0.431084 | -0.020962 | 1.462870 | 0.449396 | -1.862595 | -1.689768 | -1.831065 | -0.055815 | -0.070153 | -0.032805 | ... | -0.01093 | -0.015458 | -0.01093 | -0.01093 | -0.01093 | -0.018933 | -0.018933 | 0.0 | -0.01093 | -0.015458 |
8372 rows × 1511 columns
y_train
8618 0
901 1
6940 0
11462 1
4837 1
..
7813 0
10955 1
905 0
5192 0
235 0
Name: NEXT_INSPECTION_GRADE_C_OR_BELOW, Length: 8372, dtype: object
model = LogisticRegression() #I have decided that a Logistic Regression
#is honestly the simplest form of model and also fits the request of SIMPLE
model.fit(X_train, y_train)
LogisticRegression()
acc = model.score(X_test, y_test)
print("Test Accuracy: {:.2f}%".format(acc * 100))
Test Accuracy: 81.81%
# Confusion matrix and classification
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred, labels=['0', '1'])
clr = classification_report(y_test, y_pred, labels=['0', '1'])
plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, vmin=0, fmt='g', cmap='Blues', cbar=False)
plt.xticks(ticks=[0.5, 1.5], labels=['0', '1'])
plt.yticks(ticks=[0.5, 1.5], labels=['0', '1'])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()
print("Classification Report:\n----------------------\n", clr)
Classification Report:
----------------------
precision recall f1-score support
0 0.84 0.96 0.90 3023
1 0.21 0.05 0.09 566
accuracy 0.82 3589
macro avg 0.53 0.51 0.49 3589
weighted avg 0.74 0.82 0.77 3589
most_important_feature = np.argmax(model.coef_)
X_train.columns[most_important_feature]
'FIRST_VIOLATION_TYPE_Imminent Health Hazard'
• What techniques or metrics did you use to identify potential drift? • What columns have changed and in what ways? • How would you assess the overall impact of the affected columns on the model performance? • How would you address your findings with the business partner? What would you recommend?
#fair disclaimer up untill this moment I had no idea what drift analysis was.
data_drift = pd.read_csv('PRODUCTION_SAMPLE.csv')
data_drift.head(5)
| RESTAURANT_SERIAL_NUMBER | RESTAURANT_PERMIT_NUMBER | RESTAURANT_NAME | RESTAURANT_LOCATION | RESTAURANT_CATEGORY | ADDRESS | CITY | STATE | ZIP | CURRENT_DEMERITS | ... | SECOND_VIOLATION | THIRD_VIOLATION | FIRST_VIOLATION_TYPE | SECOND_VIOLATION_TYPE | THIRD_VIOLATION_TYPE | NUMBER_OF_VIOLATIONS | EMPLOYEE_COUNT | MEDIAN_EMPLOYEE_AGE | MEDIAN_EMPLOYEE_TENURE | recordYear | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | DAAEIGTRQ | PR0007720 | Sunset Station Seville Bar | Sunset Station Hotel & Casino | Bar / Tavern | 1301 W Sunset Rd | Las Vegas | Nevada | 89014 | 9 | ... | 218 | 222 | Major | Major | Major | 4 | 5.201612 | 76.617155 | 2.727325 | 2017 |
| 1 | DATHNYO3S | PR0012115 | CARNICERIA LOS PRIMOS | CARNICERIA LOS PRIMOS | Meat/Poultry/Seafood | 1600 N JONES | Las Vegas | Nevada | 89108 | 17 | ... | 211 | 213 | Critical | Major | Major | 6 | 42.029730 | 73.185672 | 2.348484 | 2017 |
| 2 | DA9L9W4ME | PR0024170 | LA CASITA DE DONA MACHI | LA CASITA DE DONA MACHI | Restaurant | 2407 S EASTERN Ave | Las Vegas | Nevada | 89104 | 39 | ... | 203 | 208 | Critical | Critical | Critical | 12 | 14.767160 | 74.734367 | 6.370513 | 2017 |
| 3 | DATFZLJL1 | PR0124347 | MIAN | MIAN | Restaurant | 4355 SPRING MOUNTAIN | Las Vegas | Nevada | 89102 | 36 | ... | 208 | 209 | Critical | Critical | Critical | 11 | 17.702804 | 60.093966 | 3.629311 | 2017 |
| 4 | DACAH1LFB | PR0099336 | SHEGAR ETHIOPIAN RESTAURANT | Shegar Ethiopian Market | Restaurant | 5785 W Tropicana Ave 4 | Las Vegas | Nevada | 89103-5135 | 19 | ... | 206 | 215 | Critical | Critical | Major | 5 | 18.302526 | 74.317389 | 4.996175 | 2017 |
5 rows × 28 columns
data_drift.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7505 entries, 0 to 7504 Data columns (total 28 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 RESTAURANT_SERIAL_NUMBER 7505 non-null object 1 RESTAURANT_PERMIT_NUMBER 7505 non-null object 2 RESTAURANT_NAME 7505 non-null object 3 RESTAURANT_LOCATION 7504 non-null object 4 RESTAURANT_CATEGORY 7505 non-null object 5 ADDRESS 7503 non-null object 6 CITY 7502 non-null object 7 STATE 7505 non-null object 8 ZIP 7426 non-null object 9 CURRENT_DEMERITS 7505 non-null int64 10 CURRENT_GRADE 7503 non-null object 11 INSPECTION_TIME 7502 non-null object 12 INSPECTION_TYPE 7505 non-null object 13 INSPECTION_DEMERITS 7505 non-null int64 14 VIOLATIONS_RAW 7505 non-null object 15 RECORD_UPDATED 7505 non-null object 16 LAT_LONG_RAW 7505 non-null object 17 FIRST_VIOLATION 7505 non-null float64 18 SECOND_VIOLATION 7505 non-null int64 19 THIRD_VIOLATION 7505 non-null int64 20 FIRST_VIOLATION_TYPE 7505 non-null object 21 SECOND_VIOLATION_TYPE 7505 non-null object 22 THIRD_VIOLATION_TYPE 7505 non-null object 23 NUMBER_OF_VIOLATIONS 7505 non-null int64 24 EMPLOYEE_COUNT 7505 non-null float64 25 MEDIAN_EMPLOYEE_AGE 7505 non-null float64 26 MEDIAN_EMPLOYEE_TENURE 7505 non-null float64 27 recordYear 7505 non-null int64 dtypes: float64(4), int64(6), object(18) memory usage: 1.6+ MB
from evidently.dashboard import Dashboard
from evidently.tabs import DataDriftTab, CatTargetDriftTab
from evidently.model_profile import Profile
from evidently.profile_sections import DataDriftProfileSection, CatTargetDriftProfileSection
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
I am going to be performing the same cleaning steps to the production dataset
drift_frame = pd.DataFrame(data_drift, columns = data_drift.columns)
drift_frame
| RESTAURANT_SERIAL_NUMBER | RESTAURANT_PERMIT_NUMBER | RESTAURANT_NAME | RESTAURANT_LOCATION | RESTAURANT_CATEGORY | ADDRESS | CITY | STATE | ZIP | CURRENT_DEMERITS | ... | SECOND_VIOLATION | THIRD_VIOLATION | FIRST_VIOLATION_TYPE | SECOND_VIOLATION_TYPE | THIRD_VIOLATION_TYPE | NUMBER_OF_VIOLATIONS | EMPLOYEE_COUNT | MEDIAN_EMPLOYEE_AGE | MEDIAN_EMPLOYEE_TENURE | recordYear | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | DAAEIGTRQ | PR0007720 | Sunset Station Seville Bar | Sunset Station Hotel & Casino | Bar / Tavern | 1301 W Sunset Rd | Las Vegas | Nevada | 89014 | 9 | ... | 218 | 222 | Major | Major | Major | 4 | 5.201612 | 76.617155 | 2.727325 | 2017 |
| 1 | DATHNYO3S | PR0012115 | CARNICERIA LOS PRIMOS | CARNICERIA LOS PRIMOS | Meat/Poultry/Seafood | 1600 N JONES | Las Vegas | Nevada | 89108 | 17 | ... | 211 | 213 | Critical | Major | Major | 6 | 42.029730 | 73.185672 | 2.348484 | 2017 |
| 2 | DA9L9W4ME | PR0024170 | LA CASITA DE DONA MACHI | LA CASITA DE DONA MACHI | Restaurant | 2407 S EASTERN Ave | Las Vegas | Nevada | 89104 | 39 | ... | 203 | 208 | Critical | Critical | Critical | 12 | 14.767160 | 74.734367 | 6.370513 | 2017 |
| 3 | DATFZLJL1 | PR0124347 | MIAN | MIAN | Restaurant | 4355 SPRING MOUNTAIN | Las Vegas | Nevada | 89102 | 36 | ... | 208 | 209 | Critical | Critical | Critical | 11 | 17.702804 | 60.093966 | 3.629311 | 2017 |
| 4 | DACAH1LFB | PR0099336 | SHEGAR ETHIOPIAN RESTAURANT | Shegar Ethiopian Market | Restaurant | 5785 W Tropicana Ave 4 | Las Vegas | Nevada | 89103-5135 | 19 | ... | 206 | 215 | Critical | Critical | Major | 5 | 18.302526 | 74.317389 | 4.996175 | 2017 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7500 | DA1033935 | PR0101112 | ACT NIGHTCLUB ANTECHAMBER BAR | ACT NIGHTCLUB @ PALAZZO | Bar / Tavern | 3325 S LAS VEGAS Blvd 3400 | Las Vegas | Nevada | 89109 | 6 | ... | 215 | 223 | Critical | Major | Major | 6 | 4.912385 | 19.766923 | 2.441293 | 2013 |
| 7501 | DA1036549 | PR0101113 | ACT NIGHTCLUB MAIN BAR | ACT NIGHTCLUB @ PALAZZO | Coffee Shop | 3325 S LAS VEGAS Blvd 3400 | Henderson | Nevada | 89109 | 6 | ... | 223 | 227 | Major | Major | Non-Major | 5 | 4.912498 | 35.465613 | 4.601223 | 2013 |
| 7502 | DA56ESJSM | PR0099943 | DENNYS #2337 SERVICE STATION | DENNYS #2337 | Snack Bar | 450 E FREMONT | Las Vegas | Nevada | 89101 | 3 | ... | 211 | 218 | 1 | Major | Major | 5 | 9.813940 | 18.116409 | 2.439122 | 2016 |
| 7503 | DA1035870 | PR0006285 | GOODTIMES NIGHTCLUB SERVICE BAR | GOODTIMES NIGHTCLUB | Bar / Tavern | 1775 E Tropicana Ave | Las Vegas | Nevada | 89119-6529 | 0 | ... | 230 | 232 | Non-Major | Non-Major | Non-Major | 4 | 6.421581 | 21.909961 | 2.091781 | 2013 |
| 7504 | DA1036087 | PR0006284 | GOODTIMES NIGHTCLUB BAR | GOODTIMES NIGHTCLUB | Bar / Tavern | 1775 E Tropicana Ave | Las Vegas | Nevada | 89119-6529 | 8 | ... | 214 | 230 | Major | Major | Non-Major | 5 | 20.449394 | 23.608468 | 3.387937 | 2013 |
7505 rows × 28 columns
start cleaning the city variable becuase i will be deleteing every column before this field
drift_frame.CITY.unique()
array(['Las Vegas', 'Henderson', 'North Las Vegas', 'Mesquite',
'Logandale', 'Boulder City', 'Primm', 'Laughlin', 'Jean',
'Indian Springs', 'Sandy Valley', 'Overton', 'Searchlight',
'Clark County', nan, 'Moapa', 'Blue Diamond', 'Goodsprings'],
dtype=object)
#drift_frame[drift_frame['CITY']=='nan']
#drift_frame[drift_frame['CITY']=='Goodsprings']
drift_frame.STATE.unique()
array(['Nevada'], dtype=object)
ZIP = drift_frame['ZIP'].unique()
ZIP #need to check if everything in this array starts with the digit 8
res = all(str(i)[0] == str(ZIP[0])[0] for i in ZIP)
res #there is a zipcode that does not start with 8
False
drift_frame['ZIP'] = drift_frame['ZIP'].str[:5]
drift_frame['ZIP'].unique() #91301 is a california Area code
array(['89014', '89108', '89104', '89102', '89103', '89147', '89146',
'89149', '89032', '89121', '89169', nan, '89107', '89129', '89130',
'89101', '89109', '89124', '89138', '89027', '89119', '89123',
'89113', '89031', '89084', '89052', '89183', '89117', '89135',
'89128', '89012', '89081', '89115', '89141', '89158', '89148',
'89110', '89139', '89118', '89127', '89156', '89030', '89005',
'89145', '89131', '89029', '89122', '89114', '89120', '89143',
'89111', '89134', '89142', '89144', '89015', '89011', '88108',
'80109', '89017', '89004', '89019', '89018', '89106', '89074',
'89165', '89186', '89166', '89179', '89178', '89002', '89086',
'89046', '55121', '89040', '89161', '89044', '89021', '89010',
'89036', '89085', '89034', '89155', '89025'], dtype=object)
drift_frame[drift_frame.ZIP=='55121']
#https://www.google.com/search?q=2255+E+CENTENNIAL+Pkwy&rlz=1C5CHFA_enUS941US944&oq=2255+E+CENTENNIAL+Pkwy&aqs=chrome..69i57j0i512.404j0j7&sourceid=chrome&ie=UTF-8
| RESTAURANT_SERIAL_NUMBER | RESTAURANT_PERMIT_NUMBER | RESTAURANT_NAME | RESTAURANT_LOCATION | RESTAURANT_CATEGORY | ADDRESS | CITY | STATE | ZIP | CURRENT_DEMERITS | ... | SECOND_VIOLATION | THIRD_VIOLATION | FIRST_VIOLATION_TYPE | SECOND_VIOLATION_TYPE | THIRD_VIOLATION_TYPE | NUMBER_OF_VIOLATIONS | EMPLOYEE_COUNT | MEDIAN_EMPLOYEE_AGE | MEDIAN_EMPLOYEE_TENURE | recordYear | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1513 | DA1278592 | PR0107738 | ROSHI @ SMITHS #334 | ROSHI @ SMITHS #334 | Snack Bar | 2255 E CENTENNIAL Pkwy | North Las Vegas | Nevada | 55121 | 6 | ... | 217 | 220 | Major | Major | Major | 4 | 16.795442 | 39.798429 | 4.246214 | 2014 |
1 rows × 28 columns
drift_frame.at[1513,'ZIP']='89081' #need to fix this 55121 zipcode that is in las vegas
drift_frame[drift_frame.ZIP==None]
drift_frame[drift_frame.ZIP=='nan']
| RESTAURANT_SERIAL_NUMBER | RESTAURANT_PERMIT_NUMBER | RESTAURANT_NAME | RESTAURANT_LOCATION | RESTAURANT_CATEGORY | ADDRESS | CITY | STATE | ZIP | CURRENT_DEMERITS | ... | SECOND_VIOLATION | THIRD_VIOLATION | FIRST_VIOLATION_TYPE | SECOND_VIOLATION_TYPE | THIRD_VIOLATION_TYPE | NUMBER_OF_VIOLATIONS | EMPLOYEE_COUNT | MEDIAN_EMPLOYEE_AGE | MEDIAN_EMPLOYEE_TENURE | recordYear |
|---|
0 rows × 28 columns
#phatnom NAN
drift_frame['ZIP'].unique() #91301 is a california Area code
array(['89014', '89108', '89104', '89102', '89103', '89147', '89146',
'89149', '89032', '89121', '89169', nan, '89107', '89129', '89130',
'89101', '89109', '89124', '89138', '89027', '89119', '89123',
'89113', '89031', '89084', '89052', '89183', '89117', '89135',
'89128', '89012', '89081', '89115', '89141', '89158', '89148',
'89110', '89139', '89118', '89127', '89156', '89030', '89005',
'89145', '89131', '89029', '89122', '89114', '89120', '89143',
'89111', '89134', '89142', '89144', '89015', '89011', '88108',
'80109', '89017', '89004', '89019', '89018', '89106', '89074',
'89165', '89186', '89166', '89179', '89178', '89002', '89086',
'89046', '89040', '89161', '89044', '89021', '89010', '89036',
'89085', '89034', '89155', '89025'], dtype=object)
all(isinstance(x, (int, float)) for x in drift_frame['CURRENT_DEMERITS'])
#CURRENT_DEMERITS seems to be ok
True
drift_frame.CURRENT_GRADE.unique() #grades seem to be ok.....not
array(['A', 'B', 'C', 'X', 'M', 'Elephant', 'O', 'N', 'VVVVVV', nan],
dtype=object)
#drift_frame[drift_frame.CURRENT_GRADE=='X']
letters_to_delete = ['X', 'M', 'Elephant', 'O', 'N', 'VVVVVV','nan']
for each_letter in letters_to_delete:
delLIST=drift_frame[drift_frame.CURRENT_GRADE==each_letter].index
delLIST
for each in delLIST:
drift_frame = drift_frame.drop(each)
drift_frame.CURRENT_GRADE.unique() #grades seem to be ok.....not
array(['A', 'B', 'C', nan], dtype=object)
all(isinstance(x, (int, float)) for x in drift_frame['INSPECTION_DEMERITS'])
True
drift_frame.INSPECTION_TYPE.unique()# unearthing some funnys
array(['Routine Inspection', 'Re-inspection', 'Gordon Ramsay Visit',
'Epidemiological Investigation'], dtype=object)
drift_frame[drift_frame.INSPECTION_TYPE=='Gordon Ramsay Visit']
drift_frame = drift_frame.drop(2716) #deleting funny, we statistically got less funny with this edit
drift_frame[drift_frame.INSPECTION_TYPE=='Epidemiological Investigation']
drift_frame = drift_frame.drop(6825)
drift_frame.INSPECTION_TYPE.unique()# unearthing some funnys
array(['Routine Inspection', 'Re-inspection'], dtype=object)
all(isinstance(x, (int, float)) for x in drift_frame['VIOLATIONS_RAW'])
False
drift_frame['VIOLATIONS_RAW'].unique()
array(['2,152,182,222,930', '20,621,121,322,029,300,000',
'2,022,032,082,112,120,000,000,000,000,000,000,000,000', ...,
'20,221,121,829,282,900', '227,230,232,233', '212,214,230,232,233'],
dtype=object)
drift_frame2=[]
for each in drift_frame.VIOLATIONS_RAW:
string_list = each.split(",")
each = list(filter(lambda string: string != '000', string_list))
a =each
drift_frame2.append(a)
drift_frame.VIOLATIONS_RAW = drift_frame2
test2=[]
for each in drift_frame.VIOLATIONS_RAW:
a=(any(c.isalpha() for c in each))
test2.append(a)
[i for i, x in enumerate(test2) if x]
[]
[i for i, item in enumerate(drift_frame.VIOLATIONS_RAW) if item is None]
[]
print(all(isinstance(x, (int, float)) for x in drift_frame['FIRST_VIOLATION']))
print(all(isinstance(x, (int, float)) for x in drift_frame['SECOND_VIOLATION']))
print(all(isinstance(x, (int, float)) for x in drift_frame['THIRD_VIOLATION']))
True True True
print(drift_frame.FIRST_VIOLATION_TYPE.unique() )
print('-')
print(drift_frame.SECOND_VIOLATION_TYPE.unique() )
#im not sure Kitchen Nightmares and Supercritical are violation types if there is only 1 row I will delete it
print('-')
print(drift_frame.THIRD_VIOLATION_TYPE.unique() )
['Major' 'Critical' '1' 'Fuzzy' 'Non-Major'] - ['Major' 'Critical' 'Imminent Health Hazard' 'Confusing' 'Non-Major'] - ['Major' 'Critical' 'Imminent Health Hazard' 'Illogical' 'Non-Major']
drift_frame[drift_frame.SECOND_VIOLATION_TYPE=='Imminent Health Hazard'] #Do i assume 1 is 'Imminent Health Hazard'
| RESTAURANT_SERIAL_NUMBER | RESTAURANT_PERMIT_NUMBER | RESTAURANT_NAME | RESTAURANT_LOCATION | RESTAURANT_CATEGORY | ADDRESS | CITY | STATE | ZIP | CURRENT_DEMERITS | ... | SECOND_VIOLATION | THIRD_VIOLATION | FIRST_VIOLATION_TYPE | SECOND_VIOLATION_TYPE | THIRD_VIOLATION_TYPE | NUMBER_OF_VIOLATIONS | EMPLOYEE_COUNT | MEDIAN_EMPLOYEE_AGE | MEDIAN_EMPLOYEE_TENURE | recordYear | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 309 | DAFS70YJ1 | PR0004882 | EXCALIBUR BUFFET | EXCALIBUR HOTEL & CASINO | Buffet | 3850 S Las Vegas Blvd | Las Vegas | Nevada | 89109 | 5 | ... | 301 | 302 | Critical | Imminent Health Hazard | Imminent Health Hazard | 13 | 11.436405 | 72.593439 | 3.248060 | 2017 |
| 317 | DACJLY4EU | PR0111401 | ALOHA A GO GO - SVC STATION & STORAGE | Aloha a Go Go | Bar / Tavern | 5841 E Charleston Blvd 260 | Las Vegas | Nevada | 89142 | 0 | ... | 302 | 303 | 1 | Imminent Health Hazard | Imminent Health Hazard | 10 | 32.263072 | 69.768636 | 4.396931 | 2017 |
| 5485 | DA0842369 | PR0013493 | Pele & Payos Ice Cream #2 | Pele & Payos Ice Cream | Food Trucks / Mobile Vendor | 1525 Linden Ave | North Las Vegas | Nevada | 89101 | 0 | ... | 301 | 302 | 1 | Imminent Health Hazard | Imminent Health Hazard | 11 | 24.866698 | 25.719482 | 5.068924 | 2013 |
3 rows × 28 columns
looking at the table i think maybe 1 is immintent health hazard and i am proceeding as is
b=drift_frame[drift_frame.FIRST_VIOLATION_TYPE=='1'].index #Do i assume 1 is
b
Int64Index([ 28, 34, 59, 63, 66, 93, 100, 114, 121, 132,
...
7441, 7442, 7445, 7452, 7471, 7476, 7480, 7489, 7491, 7502],
dtype='int64', length=1146)
for each in b:
drift_frame.at[b,'FIRST_VIOLATION_TYPE']='Imminent Health Hazard'
print(drift_frame.FIRST_VIOLATION_TYPE.unique() )
['Major' 'Critical' 'Imminent Health Hazard' 'Fuzzy' 'Non-Major']
drift_frame[drift_frame.FIRST_VIOLATION_TYPE=='Fuzzy'] #Delete this crazy row
| RESTAURANT_SERIAL_NUMBER | RESTAURANT_PERMIT_NUMBER | RESTAURANT_NAME | RESTAURANT_LOCATION | RESTAURANT_CATEGORY | ADDRESS | CITY | STATE | ZIP | CURRENT_DEMERITS | ... | SECOND_VIOLATION | THIRD_VIOLATION | FIRST_VIOLATION_TYPE | SECOND_VIOLATION_TYPE | THIRD_VIOLATION_TYPE | NUMBER_OF_VIOLATIONS | EMPLOYEE_COUNT | MEDIAN_EMPLOYEE_AGE | MEDIAN_EMPLOYEE_TENURE | recordYear | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1523 | DA1217034 | PR0104465 | NY PIZZA RIBS & WINGS | NY PIZZA RIBS & WINGS | Restaurant | 1000 S TORREY PINES | Las Vegas | Nevada | 89107 | 6 | ... | 209 | 211 | Fuzzy | Confusing | Illogical | 11 | 3.413278 | 31.818643 | 1.064101 | 2014 |
1 rows × 28 columns
drift_frame = drift_frame.drop(1523)
print(drift_frame.FIRST_VIOLATION_TYPE.unique() )
['Major' 'Critical' 'Imminent Health Hazard' 'Non-Major']
drift_frame[drift_frame.SECOND_VIOLATION_TYPE=='Confusing '] #Deleted from cell 156
| RESTAURANT_SERIAL_NUMBER | RESTAURANT_PERMIT_NUMBER | RESTAURANT_NAME | RESTAURANT_LOCATION | RESTAURANT_CATEGORY | ADDRESS | CITY | STATE | ZIP | CURRENT_DEMERITS | ... | SECOND_VIOLATION | THIRD_VIOLATION | FIRST_VIOLATION_TYPE | SECOND_VIOLATION_TYPE | THIRD_VIOLATION_TYPE | NUMBER_OF_VIOLATIONS | EMPLOYEE_COUNT | MEDIAN_EMPLOYEE_AGE | MEDIAN_EMPLOYEE_TENURE | recordYear |
|---|
0 rows × 28 columns
print(drift_frame.SECOND_VIOLATION_TYPE.unique() )
['Major' 'Critical' 'Imminent Health Hazard' 'Non-Major']
print(drift_frame.THIRD_VIOLATION_TYPE.unique() )
['Major' 'Critical' 'Imminent Health Hazard' 'Non-Major']
drift_frame.columns#reminder of which field I am on
Index(['RESTAURANT_SERIAL_NUMBER', 'RESTAURANT_PERMIT_NUMBER',
'RESTAURANT_NAME', 'RESTAURANT_LOCATION', 'RESTAURANT_CATEGORY',
'ADDRESS', 'CITY', 'STATE', 'ZIP', 'CURRENT_DEMERITS', 'CURRENT_GRADE',
'INSPECTION_TIME', 'INSPECTION_TYPE', 'INSPECTION_DEMERITS',
'VIOLATIONS_RAW', 'RECORD_UPDATED', 'LAT_LONG_RAW', 'FIRST_VIOLATION',
'SECOND_VIOLATION', 'THIRD_VIOLATION', 'FIRST_VIOLATION_TYPE',
'SECOND_VIOLATION_TYPE', 'THIRD_VIOLATION_TYPE', 'NUMBER_OF_VIOLATIONS',
'EMPLOYEE_COUNT', 'MEDIAN_EMPLOYEE_AGE', 'MEDIAN_EMPLOYEE_TENURE',
'recordYear'],
dtype='object')
print(all(isinstance(x, (int, float)) for x in drift_frame['NUMBER_OF_VIOLATIONS']))
print(all(isinstance(x, (int, float)) for x in drift_frame['EMPLOYEE_COUNT']))
print(all(isinstance(x, (int, float)) for x in drift_frame['MEDIAN_EMPLOYEE_AGE']))
print(all(isinstance(x, (int, float)) for x in drift_frame['MEDIAN_EMPLOYEE_TENURE']))
print(all(isinstance(x, (int, float)) for x in drift_frame['recordYear']))
True True True True True
print(drift_frame.recordYear.unique() )
[2017 2016 2015 2014 2013 2012 2011]
drift_frame=drift_frame.dropna() #delete missing
drift_frame.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 7343 entries, 0 to 7501 Data columns (total 28 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 RESTAURANT_SERIAL_NUMBER 7343 non-null object 1 RESTAURANT_PERMIT_NUMBER 7343 non-null object 2 RESTAURANT_NAME 7343 non-null object 3 RESTAURANT_LOCATION 7343 non-null object 4 RESTAURANT_CATEGORY 7343 non-null object 5 ADDRESS 7343 non-null object 6 CITY 7343 non-null object 7 STATE 7343 non-null object 8 ZIP 7343 non-null object 9 CURRENT_DEMERITS 7343 non-null int64 10 CURRENT_GRADE 7343 non-null object 11 INSPECTION_TIME 7343 non-null object 12 INSPECTION_TYPE 7343 non-null object 13 INSPECTION_DEMERITS 7343 non-null int64 14 VIOLATIONS_RAW 7343 non-null object 15 RECORD_UPDATED 7343 non-null object 16 LAT_LONG_RAW 7343 non-null object 17 FIRST_VIOLATION 7343 non-null float64 18 SECOND_VIOLATION 7343 non-null int64 19 THIRD_VIOLATION 7343 non-null int64 20 FIRST_VIOLATION_TYPE 7343 non-null object 21 SECOND_VIOLATION_TYPE 7343 non-null object 22 THIRD_VIOLATION_TYPE 7343 non-null object 23 NUMBER_OF_VIOLATIONS 7343 non-null int64 24 EMPLOYEE_COUNT 7343 non-null float64 25 MEDIAN_EMPLOYEE_AGE 7343 non-null float64 26 MEDIAN_EMPLOYEE_TENURE 7343 non-null float64 27 recordYear 7343 non-null int64 dtypes: float64(4), int64(6), object(18) memory usage: 1.6+ MB
CLEANING DONE
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
import json
import pandas as pd
import numpy as np
import requests
import zipfile
import io
import plotly.offline as py #working offline
import plotly.graph_objs as go
from evidently.pipeline.column_mapping import ColumnMapping
from evidently.model_profile import Profile
from evidently.profile_sections import DataDriftProfileSection
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient
py.init_notebook_mode()
drift_frame.columns
Index(['RESTAURANT_SERIAL_NUMBER', 'RESTAURANT_PERMIT_NUMBER',
'RESTAURANT_NAME', 'RESTAURANT_LOCATION', 'RESTAURANT_CATEGORY',
'ADDRESS', 'CITY', 'STATE', 'ZIP', 'CURRENT_DEMERITS', 'CURRENT_GRADE',
'INSPECTION_TIME', 'INSPECTION_TYPE', 'INSPECTION_DEMERITS',
'VIOLATIONS_RAW', 'RECORD_UPDATED', 'LAT_LONG_RAW', 'FIRST_VIOLATION',
'SECOND_VIOLATION', 'THIRD_VIOLATION', 'FIRST_VIOLATION_TYPE',
'SECOND_VIOLATION_TYPE', 'THIRD_VIOLATION_TYPE', 'NUMBER_OF_VIOLATIONS',
'EMPLOYEE_COUNT', 'MEDIAN_EMPLOYEE_AGE', 'MEDIAN_EMPLOYEE_TENURE',
'recordYear'],
dtype='object')
drift_frame.head(5)
| RESTAURANT_SERIAL_NUMBER | RESTAURANT_PERMIT_NUMBER | RESTAURANT_NAME | RESTAURANT_LOCATION | RESTAURANT_CATEGORY | ADDRESS | CITY | STATE | ZIP | CURRENT_DEMERITS | ... | SECOND_VIOLATION | THIRD_VIOLATION | FIRST_VIOLATION_TYPE | SECOND_VIOLATION_TYPE | THIRD_VIOLATION_TYPE | NUMBER_OF_VIOLATIONS | EMPLOYEE_COUNT | MEDIAN_EMPLOYEE_AGE | MEDIAN_EMPLOYEE_TENURE | recordYear | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | DAAEIGTRQ | PR0007720 | Sunset Station Seville Bar | Sunset Station Hotel & Casino | Bar / Tavern | 1301 W Sunset Rd | Las Vegas | Nevada | 89014 | 9 | ... | 218 | 222 | Major | Major | Major | 4 | 5.201612 | 76.617155 | 2.727325 | 2017 |
| 1 | DATHNYO3S | PR0012115 | CARNICERIA LOS PRIMOS | CARNICERIA LOS PRIMOS | Meat/Poultry/Seafood | 1600 N JONES | Las Vegas | Nevada | 89108 | 17 | ... | 211 | 213 | Critical | Major | Major | 6 | 42.029730 | 73.185672 | 2.348484 | 2017 |
| 2 | DA9L9W4ME | PR0024170 | LA CASITA DE DONA MACHI | LA CASITA DE DONA MACHI | Restaurant | 2407 S EASTERN Ave | Las Vegas | Nevada | 89104 | 39 | ... | 203 | 208 | Critical | Critical | Critical | 12 | 14.767160 | 74.734367 | 6.370513 | 2017 |
| 3 | DATFZLJL1 | PR0124347 | MIAN | MIAN | Restaurant | 4355 SPRING MOUNTAIN | Las Vegas | Nevada | 89102 | 36 | ... | 208 | 209 | Critical | Critical | Critical | 11 | 17.702804 | 60.093966 | 3.629311 | 2017 |
| 4 | DACAH1LFB | PR0099336 | SHEGAR ETHIOPIAN RESTAURANT | Shegar Ethiopian Market | Restaurant | 5785 W Tropicana Ave 4 | Las Vegas | Nevada | 89103 | 19 | ... | 206 | 215 | Critical | Critical | Major | 5 | 18.302526 | 74.317389 | 4.996175 | 2017 |
5 rows × 28 columns
need to delete cols that probs wont change over time unless business close.
drift_frame = drift_frame.drop(['RESTAURANT_SERIAL_NUMBER', 'RESTAURANT_PERMIT_NUMBER',
'RESTAURANT_NAME', 'RESTAURANT_LOCATION', 'RESTAURANT_CATEGORY',
'ADDRESS', 'CITY', 'STATE', 'ZIP'],axis=1)
drift_frame
| CURRENT_DEMERITS | CURRENT_GRADE | INSPECTION_TIME | INSPECTION_TYPE | INSPECTION_DEMERITS | VIOLATIONS_RAW | RECORD_UPDATED | LAT_LONG_RAW | FIRST_VIOLATION | SECOND_VIOLATION | THIRD_VIOLATION | FIRST_VIOLATION_TYPE | SECOND_VIOLATION_TYPE | THIRD_VIOLATION_TYPE | NUMBER_OF_VIOLATIONS | EMPLOYEE_COUNT | MEDIAN_EMPLOYEE_AGE | MEDIAN_EMPLOYEE_TENURE | recordYear | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 9 | A | 10/10/2017 11:30 | Routine Inspection | 9 | [2, 152, 182, 222, 930] | 10/10/2017 13:15 | (36.0604553, 115.0410892) | 215.0 | 218 | 222 | Major | Major | Major | 4 | 5.201612 | 76.617155 | 2.727325 | 2017 |
| 1 | 17 | B | 10/9/2017 15:30 | Routine Inspection | 17 | [20, 621, 121, 322, 029, 300] | 10/9/2017 16:47 | (36.1887688, 115.2235063) | 206.0 | 211 | 213 | Critical | Major | Major | 6 | 42.029730 | 73.185672 | 2.348484 | 2017 |
| 2 | 39 | C | 10/9/2017 13:40 | Routine Inspection | 45 | [2, 022, 032, 082, 112, 120] | 10/10/2017 14:53 | (36.1447368, 115.1181615) | 202.0 | 203 | 208 | Critical | Critical | Critical | 12 | 14.767160 | 74.734367 | 6.370513 | 2017 |
| 3 | 36 | C | 10/9/2017 13:35 | Routine Inspection | 36 | [20, 420, 820, 921, 221, 300] | 10/9/2017 17:51 | (36.125908, 115.197904) | 204.0 | 208 | 209 | Critical | Critical | Critical | 11 | 17.702804 | 60.093966 | 3.629311 | 2017 |
| 4 | 19 | B | 10/9/2017 13:00 | Routine Inspection | 24 | [2, 042, 062, 152, 162, 950] | 10/9/2017 15:00 | (36.100041, 115.220159) | 204.0 | 206 | 215 | Critical | Critical | Major | 5 | 18.302526 | 74.317389 | 4.996175 | 2017 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7497 | 0 | A | 1/4/2010 11:55 | Routine Inspection | 3 | [24, 31, 37] | 2/21/2013 22:26 | (36.2390564, 115.0679808) | 24.0 | 31 | 37 | Non-Major | Non-Major | Non-Major | 3 | 23.598569 | 22.094623 | 3.313007 | 2013 |
| 7498 | 6 | A | 1/4/2010 10:25 | Routine Inspection | 10 | [14, 27, 31, 36] | 2/21/2013 22:26 | (36.1089399, 115.2785772) | 14.0 | 27 | 31 | Major | Non-Major | Non-Major | 4 | 16.521002 | 28.523227 | 3.486399 | 2013 |
| 7499 | 3 | A | 1/4/2010 10:15 | Routine Inspection | 5 | [13, 31, 35] | 2/21/2013 22:26 | (36.1564007, 115.3335552) | 13.0 | 31 | 35 | Major | Non-Major | Non-Major | 3 | 15.508296 | 25.676835 | 1.849281 | 2013 |
| 7500 | 6 | A | 1/1/1900 0:00 | Routine Inspection | 18 | [202, 215, 223, 227, 230] | 2/21/2013 22:26 | (36.1247514, 115.1696661) | 202.0 | 215 | 223 | Critical | Major | Major | 6 | 4.912385 | 19.766923 | 2.441293 | 2013 |
| 7501 | 6 | A | 1/1/1900 0:00 | Routine Inspection | 12 | [215, 223, 227, 230, 233] | 2/21/2013 22:26 | (36.1247514, 115.1696661) | 215.0 | 223 | 227 | Major | Major | Non-Major | 5 | 4.912498 | 35.465613 | 4.601223 | 2013 |
7343 rows × 19 columns
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
drift_frame['CURRENT_GRADE_ENCODED'] = LabelEncoder().fit_transform(drift_frame['CURRENT_GRADE'])
drift_frame=drift_frame.reset_index(drop='true')
drift_frame
| CURRENT_DEMERITS | CURRENT_GRADE | INSPECTION_TIME | INSPECTION_TYPE | INSPECTION_DEMERITS | VIOLATIONS_RAW | RECORD_UPDATED | LAT_LONG_RAW | FIRST_VIOLATION | SECOND_VIOLATION | THIRD_VIOLATION | FIRST_VIOLATION_TYPE | SECOND_VIOLATION_TYPE | THIRD_VIOLATION_TYPE | NUMBER_OF_VIOLATIONS | EMPLOYEE_COUNT | MEDIAN_EMPLOYEE_AGE | MEDIAN_EMPLOYEE_TENURE | recordYear | CURRENT_GRADE_ENCODED | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 9 | A | 10/10/2017 11:30 | Routine Inspection | 9 | [2, 152, 182, 222, 930] | 10/10/2017 13:15 | (36.0604553, 115.0410892) | 215.0 | 218 | 222 | Major | Major | Major | 4 | 5.201612 | 76.617155 | 2.727325 | 2017 | 0 |
| 1 | 17 | B | 10/9/2017 15:30 | Routine Inspection | 17 | [20, 621, 121, 322, 029, 300] | 10/9/2017 16:47 | (36.1887688, 115.2235063) | 206.0 | 211 | 213 | Critical | Major | Major | 6 | 42.029730 | 73.185672 | 2.348484 | 2017 | 1 |
| 2 | 39 | C | 10/9/2017 13:40 | Routine Inspection | 45 | [2, 022, 032, 082, 112, 120] | 10/10/2017 14:53 | (36.1447368, 115.1181615) | 202.0 | 203 | 208 | Critical | Critical | Critical | 12 | 14.767160 | 74.734367 | 6.370513 | 2017 | 2 |
| 3 | 36 | C | 10/9/2017 13:35 | Routine Inspection | 36 | [20, 420, 820, 921, 221, 300] | 10/9/2017 17:51 | (36.125908, 115.197904) | 204.0 | 208 | 209 | Critical | Critical | Critical | 11 | 17.702804 | 60.093966 | 3.629311 | 2017 | 2 |
| 4 | 19 | B | 10/9/2017 13:00 | Routine Inspection | 24 | [2, 042, 062, 152, 162, 950] | 10/9/2017 15:00 | (36.100041, 115.220159) | 204.0 | 206 | 215 | Critical | Critical | Major | 5 | 18.302526 | 74.317389 | 4.996175 | 2017 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7338 | 0 | A | 1/4/2010 11:55 | Routine Inspection | 3 | [24, 31, 37] | 2/21/2013 22:26 | (36.2390564, 115.0679808) | 24.0 | 31 | 37 | Non-Major | Non-Major | Non-Major | 3 | 23.598569 | 22.094623 | 3.313007 | 2013 | 0 |
| 7339 | 6 | A | 1/4/2010 10:25 | Routine Inspection | 10 | [14, 27, 31, 36] | 2/21/2013 22:26 | (36.1089399, 115.2785772) | 14.0 | 27 | 31 | Major | Non-Major | Non-Major | 4 | 16.521002 | 28.523227 | 3.486399 | 2013 | 0 |
| 7340 | 3 | A | 1/4/2010 10:15 | Routine Inspection | 5 | [13, 31, 35] | 2/21/2013 22:26 | (36.1564007, 115.3335552) | 13.0 | 31 | 35 | Major | Non-Major | Non-Major | 3 | 15.508296 | 25.676835 | 1.849281 | 2013 | 0 |
| 7341 | 6 | A | 1/1/1900 0:00 | Routine Inspection | 18 | [202, 215, 223, 227, 230] | 2/21/2013 22:26 | (36.1247514, 115.1696661) | 202.0 | 215 | 223 | Critical | Major | Major | 6 | 4.912385 | 19.766923 | 2.441293 | 2013 | 0 |
| 7342 | 6 | A | 1/1/1900 0:00 | Routine Inspection | 12 | [215, 223, 227, 230, 233] | 2/21/2013 22:26 | (36.1247514, 115.1696661) | 215.0 | 223 | 227 | Major | Major | Non-Major | 5 | 4.912498 | 35.465613 | 4.601223 | 2013 | 0 |
7343 rows × 20 columns
drift_frame
| CURRENT_DEMERITS | CURRENT_GRADE | INSPECTION_TIME | INSPECTION_TYPE | INSPECTION_DEMERITS | VIOLATIONS_RAW | RECORD_UPDATED | LAT_LONG_RAW | FIRST_VIOLATION | SECOND_VIOLATION | THIRD_VIOLATION | FIRST_VIOLATION_TYPE | SECOND_VIOLATION_TYPE | THIRD_VIOLATION_TYPE | NUMBER_OF_VIOLATIONS | EMPLOYEE_COUNT | MEDIAN_EMPLOYEE_AGE | MEDIAN_EMPLOYEE_TENURE | recordYear | CURRENT_GRADE_ENCODED | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 9 | A | 10/10/2017 11:30 | Routine Inspection | 9 | [2, 152, 182, 222, 930] | 10/10/2017 13:15 | (36.0604553, 115.0410892) | 215.0 | 218 | 222 | Major | Major | Major | 4 | 5.201612 | 76.617155 | 2.727325 | 2017 | 0 |
| 1 | 17 | B | 10/9/2017 15:30 | Routine Inspection | 17 | [20, 621, 121, 322, 029, 300] | 10/9/2017 16:47 | (36.1887688, 115.2235063) | 206.0 | 211 | 213 | Critical | Major | Major | 6 | 42.029730 | 73.185672 | 2.348484 | 2017 | 1 |
| 2 | 39 | C | 10/9/2017 13:40 | Routine Inspection | 45 | [2, 022, 032, 082, 112, 120] | 10/10/2017 14:53 | (36.1447368, 115.1181615) | 202.0 | 203 | 208 | Critical | Critical | Critical | 12 | 14.767160 | 74.734367 | 6.370513 | 2017 | 2 |
| 3 | 36 | C | 10/9/2017 13:35 | Routine Inspection | 36 | [20, 420, 820, 921, 221, 300] | 10/9/2017 17:51 | (36.125908, 115.197904) | 204.0 | 208 | 209 | Critical | Critical | Critical | 11 | 17.702804 | 60.093966 | 3.629311 | 2017 | 2 |
| 4 | 19 | B | 10/9/2017 13:00 | Routine Inspection | 24 | [2, 042, 062, 152, 162, 950] | 10/9/2017 15:00 | (36.100041, 115.220159) | 204.0 | 206 | 215 | Critical | Critical | Major | 5 | 18.302526 | 74.317389 | 4.996175 | 2017 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7338 | 0 | A | 1/4/2010 11:55 | Routine Inspection | 3 | [24, 31, 37] | 2/21/2013 22:26 | (36.2390564, 115.0679808) | 24.0 | 31 | 37 | Non-Major | Non-Major | Non-Major | 3 | 23.598569 | 22.094623 | 3.313007 | 2013 | 0 |
| 7339 | 6 | A | 1/4/2010 10:25 | Routine Inspection | 10 | [14, 27, 31, 36] | 2/21/2013 22:26 | (36.1089399, 115.2785772) | 14.0 | 27 | 31 | Major | Non-Major | Non-Major | 4 | 16.521002 | 28.523227 | 3.486399 | 2013 | 0 |
| 7340 | 3 | A | 1/4/2010 10:15 | Routine Inspection | 5 | [13, 31, 35] | 2/21/2013 22:26 | (36.1564007, 115.3335552) | 13.0 | 31 | 35 | Major | Non-Major | Non-Major | 3 | 15.508296 | 25.676835 | 1.849281 | 2013 | 0 |
| 7341 | 6 | A | 1/1/1900 0:00 | Routine Inspection | 18 | [202, 215, 223, 227, 230] | 2/21/2013 22:26 | (36.1247514, 115.1696661) | 202.0 | 215 | 223 | Critical | Major | Major | 6 | 4.912385 | 19.766923 | 2.441293 | 2013 | 0 |
| 7342 | 6 | A | 1/1/1900 0:00 | Routine Inspection | 12 | [215, 223, 227, 230, 233] | 2/21/2013 22:26 | (36.1247514, 115.1696661) | 215.0 | 223 | 227 | Major | Major | Non-Major | 5 | 4.912498 | 35.465613 | 4.601223 | 2013 | 0 |
7343 rows × 20 columns
drift_frame['INSPECTION_TYPE_ENCODED'] = LabelEncoder().fit_transform(drift_frame['INSPECTION_TYPE'])
drift_frame['FIRST_VIOLATION_TYPE_ENCODED'] = LabelEncoder().fit_transform(drift_frame['FIRST_VIOLATION_TYPE'])
drift_frame['SECOND_VIOLATION_TYPE_ENCODED'] = LabelEncoder().fit_transform(drift_frame['SECOND_VIOLATION_TYPE'])
drift_frame['THIRD_VIOLATION_TYPE_ENCODED'] = LabelEncoder().fit_transform(drift_frame['THIRD_VIOLATION_TYPE'])
drift_frame=drift_frame.drop(['CURRENT_GRADE','INSPECTION_TYPE','FIRST_VIOLATION_TYPE','SECOND_VIOLATION_TYPE','THIRD_VIOLATION_TYPE'],axis=1)
drift_frame=drift_frame.drop(7341)
drift_frame=drift_frame.drop(7342)
first_column = drift_frame.pop('INSPECTION_TIME')
drift_frame.insert(0, 'INSPECTION_TIME', first_column)
drift_frame=drift_frame.reindex(index=drift_frame.index[::-1])
drift_frame=drift_frame.reset_index(drop='true')
drift_frame = drift_frame.drop(['LAT_LONG_RAW','RECORD_UPDATED'],axis=1)
type(drift_frame.INSPECTION_TIME[0])
str
drift_frame[['date','time']] = drift_frame.INSPECTION_TIME.str.split(expand=True)
drift_frame['INSPECTION_TIME'] = (pd.to_datetime(drift_frame.pop('date'), format='%m/%d/%Y') +
pd.to_timedelta(drift_frame.pop('time') + ':00'))
type(drift_frame.INSPECTION_TIME[0])
pandas._libs.tslibs.timestamps.Timestamp
VIOLATIONS_RAW_COUNT=[]
for each in drift_frame.VIOLATIONS_RAW:
count=len(set(each))
VIOLATIONS_RAW_COUNT.append(count)
drift_frame.VIOLATIONS_RAW=VIOLATIONS_RAW_COUNT
drift_frame #need to drop
| INSPECTION_TIME | CURRENT_DEMERITS | INSPECTION_DEMERITS | VIOLATIONS_RAW | FIRST_VIOLATION | SECOND_VIOLATION | THIRD_VIOLATION | NUMBER_OF_VIOLATIONS | EMPLOYEE_COUNT | MEDIAN_EMPLOYEE_AGE | MEDIAN_EMPLOYEE_TENURE | recordYear | CURRENT_GRADE_ENCODED | INSPECTION_TYPE_ENCODED | FIRST_VIOLATION_TYPE_ENCODED | SECOND_VIOLATION_TYPE_ENCODED | THIRD_VIOLATION_TYPE_ENCODED | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2010-01-04 10:15:00 | 3 | 5 | 3 | 13.0 | 31 | 35 | 3 | 15.508296 | 25.676835 | 1.849281 | 2013 | 0 | 1 | 2 | 3 | 3 |
| 1 | 2010-01-04 10:25:00 | 6 | 10 | 4 | 14.0 | 27 | 31 | 4 | 16.521002 | 28.523227 | 3.486399 | 2013 | 0 | 1 | 2 | 3 | 3 |
| 2 | 2010-01-04 11:55:00 | 0 | 3 | 3 | 24.0 | 31 | 37 | 3 | 23.598569 | 22.094623 | 3.313007 | 2013 | 0 | 1 | 3 | 3 | 3 |
| 3 | 2010-01-04 13:10:00 | 9 | 10 | 5 | 13.0 | 23 | 30 | 5 | 5.071727 | 25.951826 | 4.931274 | 2013 | 0 | 1 | 2 | 3 | 3 |
| 4 | 2010-01-04 13:15:00 | 0 | 8 | 3 | 16.0 | 22 | 36 | 3 | 43.568982 | 29.244711 | 2.710351 | 2013 | 0 | 1 | 0 | 3 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7336 | 2017-10-09 13:00:00 | 19 | 24 | 6 | 204.0 | 206 | 215 | 5 | 18.302526 | 74.317389 | 4.996175 | 2017 | 1 | 1 | 0 | 0 | 2 |
| 7337 | 2017-10-09 13:35:00 | 36 | 36 | 6 | 204.0 | 208 | 209 | 11 | 17.702804 | 60.093966 | 3.629311 | 2017 | 2 | 1 | 0 | 0 | 0 |
| 7338 | 2017-10-09 13:40:00 | 39 | 45 | 6 | 202.0 | 203 | 208 | 12 | 14.767160 | 74.734367 | 6.370513 | 2017 | 2 | 1 | 0 | 0 | 0 |
| 7339 | 2017-10-09 15:30:00 | 17 | 17 | 6 | 206.0 | 211 | 213 | 6 | 42.029730 | 73.185672 | 2.348484 | 2017 | 1 | 1 | 0 | 2 | 2 |
| 7340 | 2017-10-10 11:30:00 | 9 | 9 | 5 | 215.0 | 218 | 222 | 4 | 5.201612 | 76.617155 | 2.727325 | 2017 | 0 | 1 | 2 | 2 | 2 |
7341 rows × 17 columns
drift_frame = drift_frame.set_index('INSPECTION_TIME')
drift_frame
| CURRENT_DEMERITS | INSPECTION_DEMERITS | VIOLATIONS_RAW | FIRST_VIOLATION | SECOND_VIOLATION | THIRD_VIOLATION | NUMBER_OF_VIOLATIONS | EMPLOYEE_COUNT | MEDIAN_EMPLOYEE_AGE | MEDIAN_EMPLOYEE_TENURE | recordYear | CURRENT_GRADE_ENCODED | INSPECTION_TYPE_ENCODED | FIRST_VIOLATION_TYPE_ENCODED | SECOND_VIOLATION_TYPE_ENCODED | THIRD_VIOLATION_TYPE_ENCODED | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| INSPECTION_TIME | ||||||||||||||||
| 2010-01-04 10:15:00 | 3 | 5 | 3 | 13.0 | 31 | 35 | 3 | 15.508296 | 25.676835 | 1.849281 | 2013 | 0 | 1 | 2 | 3 | 3 |
| 2010-01-04 10:25:00 | 6 | 10 | 4 | 14.0 | 27 | 31 | 4 | 16.521002 | 28.523227 | 3.486399 | 2013 | 0 | 1 | 2 | 3 | 3 |
| 2010-01-04 11:55:00 | 0 | 3 | 3 | 24.0 | 31 | 37 | 3 | 23.598569 | 22.094623 | 3.313007 | 2013 | 0 | 1 | 3 | 3 | 3 |
| 2010-01-04 13:10:00 | 9 | 10 | 5 | 13.0 | 23 | 30 | 5 | 5.071727 | 25.951826 | 4.931274 | 2013 | 0 | 1 | 2 | 3 | 3 |
| 2010-01-04 13:15:00 | 0 | 8 | 3 | 16.0 | 22 | 36 | 3 | 43.568982 | 29.244711 | 2.710351 | 2013 | 0 | 1 | 0 | 3 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2017-10-09 13:00:00 | 19 | 24 | 6 | 204.0 | 206 | 215 | 5 | 18.302526 | 74.317389 | 4.996175 | 2017 | 1 | 1 | 0 | 0 | 2 |
| 2017-10-09 13:35:00 | 36 | 36 | 6 | 204.0 | 208 | 209 | 11 | 17.702804 | 60.093966 | 3.629311 | 2017 | 2 | 1 | 0 | 0 | 0 |
| 2017-10-09 13:40:00 | 39 | 45 | 6 | 202.0 | 203 | 208 | 12 | 14.767160 | 74.734367 | 6.370513 | 2017 | 2 | 1 | 0 | 0 | 0 |
| 2017-10-09 15:30:00 | 17 | 17 | 6 | 206.0 | 211 | 213 | 6 | 42.029730 | 73.185672 | 2.348484 | 2017 | 1 | 1 | 0 | 2 | 2 |
| 2017-10-10 11:30:00 | 9 | 9 | 5 | 215.0 | 218 | 222 | 4 | 5.201612 | 76.617155 | 2.727325 | 2017 | 0 | 1 | 2 | 2 | 2 |
7341 rows × 16 columns
plt.plot(drift_frame.index, drift_frame['VIOLATIONS_RAW'])
plt.xticks(rotation='vertical')
(array([733408., 733773., 734138., 734503., 734869., 735234., 735599.,
735964., 736330., 736695.]),
<a list of 10 Text major ticklabel objects>)
data_columns = ColumnMapping()
data_columns.numerical_features = ['CURRENT_GRADE_ENCODED', 'INSPECTION_TYPE_ENCODED', 'FIRST_VIOLATION_TYPE_ENCODED',
'CURRENT_DEMERITS', 'INSPECTION_DEMERITS', 'VIOLATIONS_RAW',
'FIRST_VIOLATION', 'SECOND_VIOLATION','THIRD_VIOLATION',
'NUMBER_OF_VIOLATIONS','EMPLOYEE_COUNT','MEDIAN_EMPLOYEE_AGE',
'MEDIAN_EMPLOYEE_TENURE'
,'SECOND_VIOLATION_TYPE_ENCODED', 'THIRD_VIOLATION_TYPE_ENCODED']
#set reference dates
reference_dates = ('2010-01-01 00:00:00','2010-12-31 23:00:00')
#set experiment batches dates
experiment_batches = [
('2010-01-01 00:00','2010-12-31 23:00:00'),
('2011-01-01 00:00','2011-12-31 23:00:00'),
('2012-01-01 00:00','2012-12-31 23:00:00'),
('2013-01-01 00:00','2013-12-31 23:00:00'),
('2014-01-01 00:00','2014-12-31 23:00:00'),
('2015-01-01 00:00','2015-12-31 23:00:00'),
('2016-01-01 00:00','2016-12-31 23:00:00'),
('2017-01-01 00:00','2017-12-31 23:00:00')
]
#evaluate data drift with Evidently Profile
def detect_dataset_drift(reference, production, column_mapping, confidence=0.95, threshold=0.5, get_ratio=False):
"""
Returns True if Data Drift is detected, else returns False.
If get_ratio is True, returns ration of drifted features.
The Data Drift detection depends on the confidence level and the threshold.
For each individual feature Data Drift is detected with the selected confidence (default value is 0.95).
Data Drift for the dataset is detected if share of the
drifted features is above the selected threshold (default value is 0.5).
"""
data_drift_profile = Profile(sections=[DataDriftProfileSection()])
data_drift_profile.calculate(reference, production, column_mapping=column_mapping)
report = data_drift_profile.json()
json_report = json.loads(report)
drifts = []
num_features = data_columns.numerical_features if data_columns.numerical_features else []
cat_features = data_columns.categorical_features if data_columns.categorical_features else []
for feature in num_features + cat_features:
drifts.append(json_report['data_drift']['data']['metrics'][feature]['p_value'])
n_features = len(drifts)
n_drifted_features = sum([1 if x<(1. - confidence) else 0 for x in drifts])
if get_ratio:
return n_drifted_features/n_features
else:
return True if n_drifted_features/n_features >= threshold else False
#evaluate data drift with Evidently Profile
def detect_features_drift(reference, production, column_mapping, confidence=0.95, threshold=0.5, get_pvalues=False):
"""
Returns 1 if Data Drift is detected, else returns 0.
If get_pvalues is True, returns p-value for each feature.
The Data Drift detection depends on the confidence level and the threshold.
For each individual feature Data Drift is detected with the selected confidence (default value is 0.95).
"""
data_drift_profile = Profile(sections=[DataDriftProfileSection()])
data_drift_profile.calculate(reference, production, column_mapping=column_mapping)
report = data_drift_profile.json()
json_report = json.loads(report)
drifts = []
num_features = data_columns.numerical_features if data_columns.numerical_features else []
cat_features = data_columns.categorical_features if data_columns.categorical_features else []
for feature in num_features + cat_features:
p_value = json_report['data_drift']['data']['metrics'][feature]['p_value']
if get_pvalues:
drifts.append((feature, p_value))
else:
drifts.append((feature, True if p_value <(1. - confidence) else False))
return drifts
drift_frame.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 7341 entries, 2010-01-04 10:15:00 to 2017-10-10 11:30:00 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CURRENT_DEMERITS 7341 non-null int64 1 INSPECTION_DEMERITS 7341 non-null int64 2 VIOLATIONS_RAW 7341 non-null int64 3 FIRST_VIOLATION 7341 non-null float64 4 SECOND_VIOLATION 7341 non-null int64 5 THIRD_VIOLATION 7341 non-null int64 6 NUMBER_OF_VIOLATIONS 7341 non-null int64 7 EMPLOYEE_COUNT 7341 non-null float64 8 MEDIAN_EMPLOYEE_AGE 7341 non-null float64 9 MEDIAN_EMPLOYEE_TENURE 7341 non-null float64 10 recordYear 7341 non-null int64 11 CURRENT_GRADE_ENCODED 7341 non-null int64 12 INSPECTION_TYPE_ENCODED 7341 non-null int64 13 FIRST_VIOLATION_TYPE_ENCODED 7341 non-null int64 14 SECOND_VIOLATION_TYPE_ENCODED 7341 non-null int64 15 THIRD_VIOLATION_TYPE_ENCODED 7341 non-null int64 dtypes: float64(4), int64(12) memory usage: 975.0 KB
features_historical_drift = []
for date in experiment_batches:
drifts = detect_features_drift(drift_frame.loc[reference_dates[0]:reference_dates[1]],
drift_frame.loc[date[0]:date[1]],
column_mapping=data_columns,
confidence=0.95,
threshold=0.9)
features_historical_drift.append([x[1] for x in drifts])
features_historical_drift_frame = pd.DataFrame(features_historical_drift,
columns = data_columns.numerical_features)
fig = go.Figure(data=go.Heatmap(
z = features_historical_drift_frame.astype(int).transpose(),
x = [x[1] for x in experiment_batches],
y = data_columns.numerical_features,
hoverongaps = False,
xgap = 1,
ygap = 1,
zmin = 0,
zmax = 1,
showscale = False,
colorscale = 'Bluered'
))
fig.update_xaxes(side="top")
fig.update_layout(
xaxis_title = "Timestamp",
yaxis_title = "Feature Drift"
)
fig.show()
features_historical_drift_pvalues = []
for date in experiment_batches:
drifts = detect_features_drift(drift_frame.loc[reference_dates[0]:reference_dates[1]],
drift_frame.loc[date[0]:date[1]],
column_mapping=data_columns,
confidence=0.95,
threshold=0.9,
get_pvalues=True)
features_historical_drift_pvalues.append([x[1] for x in drifts])
features_historical_drift_pvalues_frame = pd.DataFrame(features_historical_drift_pvalues,
columns = data_columns.numerical_features)
fig = go.Figure(data=go.Heatmap(
z = features_historical_drift_pvalues_frame.transpose(),
x = [x[1] for x in experiment_batches],
y = features_historical_drift_pvalues_frame.columns,
hoverongaps = False,
xgap = 1,
ygap = 1,
zmin = 0,
zmax = 1,
colorscale = 'reds_r'
)
)
fig.update_xaxes(side="top")
fig.update_layout(
xaxis_title = "Timestamp",
yaxis_title = "p-value"
)
fig.show()
dataset_historical_drift = []
for date in experiment_batches:
dataset_historical_drift.append(detect_dataset_drift(drift_frame.loc[reference_dates[0]:reference_dates[1]],
drift_frame.loc[date[0]:date[1]],
column_mapping=data_columns,
confidence=0.95,
threshold=0.8))
fig = go.Figure(data=go.Heatmap(
z = [[1 if x == True else 0 for x in dataset_historical_drift]],
x = [x[1] for x in experiment_batches],
y = [''],
hoverongaps = False,
xgap = 1,
ygap = 1,
zmin = 0,
zmax = 1,
colorscale = 'Bluered',
showscale = False
)
)
fig.update_xaxes(side="top")
fig.update_layout(
xaxis_title = "Timestamp",
yaxis_title = "Dataset Drift"
)
fig.show()
dataset_historical_drift_ratio = []
for date in experiment_batches:
dataset_historical_drift_ratio.append(detect_dataset_drift(drift_frame.loc[reference_dates[0]:reference_dates[1]],
drift_frame.loc[date[0]:date[1]],
column_mapping=data_columns,
confidence=0.95,
threshold=0.8,
get_ratio=True))
fig = go.Figure(data=go.Heatmap(
z = [dataset_historical_drift_ratio],
x = [x[1] for x in experiment_batches],
y = [''],
hoverongaps = False,
xgap = 1,
ygap = 1,
zmin = 0.5,
zmax = 1,
colorscale = 'reds'
)
)
fig.update_xaxes(side="top")
fig.update_layout(
xaxis_title = "Timestamp",
yaxis_title = "Dataset Drift"
)
fig.show()
#evaluate data drift with Evidently Profile
def detect_dataset_drift(reference, production, column_mapping, confidence=0.95, threshold=0.5, get_ratio=False):
"""
Returns True if Data Drift is detected, else returns False.
If get_ratio is True, returns ration of drifted features.
The Data Drift detection depends on the confidence level and the threshold.
For each individual feature Data Drift is detected with the selected confidence (default value is 0.95).
Data Drift for the dataset is detected if share of the drifted features is above the selected threshold (default value is 0.5).
"""
data_drift_profile = Profile(sections=[DataDriftProfileSection])
data_drift_profile.calculate(reference, production, column_mapping=column_mapping)
report = data_drift_profile.json()
json_report = json.loads(report)
drifts = []
num_features = column_mapping.get('numerical_features') if column_mapping.get('numerical_features') else []
cat_features = column_mapping.get('categorical_features') if column_mapping.get('categorical_features') else []
for feature in num_features + cat_features:
drifts.append(json_report['data_drift']['data']['metrics'][feature]['p_value'])
n_features = len(drifts)
n_drifted_features = sum([1 if x<(1. - confidence) else 0 for x in drifts])
if get_ratio:
return n_drifted_features/n_features
else:
return True if n_drifted_features/n_features >= threshold else False
REVISITING QUESTION 2
drift_frame
| CURRENT_DEMERITS | INSPECTION_DEMERITS | VIOLATIONS_RAW | FIRST_VIOLATION | SECOND_VIOLATION | THIRD_VIOLATION | NUMBER_OF_VIOLATIONS | EMPLOYEE_COUNT | MEDIAN_EMPLOYEE_AGE | MEDIAN_EMPLOYEE_TENURE | recordYear | CURRENT_GRADE_ENCODED | INSPECTION_TYPE_ENCODED | FIRST_VIOLATION_TYPE_ENCODED | SECOND_VIOLATION_TYPE_ENCODED | THIRD_VIOLATION_TYPE_ENCODED | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| INSPECTION_TIME | ||||||||||||||||
| 2010-01-04 10:15:00 | 3 | 5 | 3 | 13.0 | 31 | 35 | 3 | 15.508296 | 25.676835 | 1.849281 | 2013 | 0 | 1 | 2 | 3 | 3 |
| 2010-01-04 10:25:00 | 6 | 10 | 4 | 14.0 | 27 | 31 | 4 | 16.521002 | 28.523227 | 3.486399 | 2013 | 0 | 1 | 2 | 3 | 3 |
| 2010-01-04 11:55:00 | 0 | 3 | 3 | 24.0 | 31 | 37 | 3 | 23.598569 | 22.094623 | 3.313007 | 2013 | 0 | 1 | 3 | 3 | 3 |
| 2010-01-04 13:10:00 | 9 | 10 | 5 | 13.0 | 23 | 30 | 5 | 5.071727 | 25.951826 | 4.931274 | 2013 | 0 | 1 | 2 | 3 | 3 |
| 2010-01-04 13:15:00 | 0 | 8 | 3 | 16.0 | 22 | 36 | 3 | 43.568982 | 29.244711 | 2.710351 | 2013 | 0 | 1 | 0 | 3 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2017-10-09 13:00:00 | 19 | 24 | 6 | 204.0 | 206 | 215 | 5 | 18.302526 | 74.317389 | 4.996175 | 2017 | 1 | 1 | 0 | 0 | 2 |
| 2017-10-09 13:35:00 | 36 | 36 | 6 | 204.0 | 208 | 209 | 11 | 17.702804 | 60.093966 | 3.629311 | 2017 | 2 | 1 | 0 | 0 | 0 |
| 2017-10-09 13:40:00 | 39 | 45 | 6 | 202.0 | 203 | 208 | 12 | 14.767160 | 74.734367 | 6.370513 | 2017 | 2 | 1 | 0 | 0 | 0 |
| 2017-10-09 15:30:00 | 17 | 17 | 6 | 206.0 | 211 | 213 | 6 | 42.029730 | 73.185672 | 2.348484 | 2017 | 1 | 1 | 0 | 2 | 2 |
| 2017-10-10 11:30:00 | 9 | 9 | 5 | 215.0 | 218 | 222 | 4 | 5.201612 | 76.617155 | 2.727325 | 2017 | 0 | 1 | 2 | 2 | 2 |
7341 rows × 16 columns
#df #need to trim to 16 rows
df = df.drop(['RESTAURANT_SERIAL_NUMBER', 'RESTAURANT_PERMIT_NUMBER',
'RESTAURANT_NAME', 'RESTAURANT_LOCATION', 'RESTAURANT_CATEGORY',
'ADDRESS', 'CITY', 'STATE', 'ZIP'],axis=1)
df
| INSPECTION_TIME | CURRENT_DEMERITS | CURRENT_GRADE | EMPLOYEE_COUNT | MEDIAN_EMPLOYEE_AGE | MEDIAN_EMPLOYEE_TENURE | INSPECTION_TYPE | INSPECTION_DEMERITS | VIOLATIONS_RAW | RECORD_UPDATED | ... | SECOND_VIOLATION | THIRD_VIOLATION | FIRST_VIOLATION_TYPE | SECOND_VIOLATION_TYPE | THIRD_VIOLATION_TYPE | NUMBER_OF_VIOLATIONS | NEXT_INSPECTION_GRADE_C_OR_BELOW | year | month | day | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1/10/2012 10:25 | 3.0 | A | 32.0 | 26.942643 | 2.956749 | Routine Inspection | 20 | [202, 211, 215, 218, 224] | 2/21/2013 22:26 | ... | 211.0 | 215.0 | Critical | Major | Major | 10 | 0 | 1 | 10 | 2012 10:25 |
| 1 | 1/10/2012 11:30 | 8.0 | A | 15.0 | 32.482685 | 2.540833 | Routine Inspection | 20 | [209, 214, 215, 217, 223] | 2/21/2013 22:26 | ... | 214.0 | 215.0 | Critical | Major | Major | 8 | 0 | 1 | 10 | 2012 11:30 |
| 2 | 1/10/2012 12:10 | 0.0 | A | 11.0 | 32.357290 | 2.047184 | Routine Inspection | 19 | [202, 209, 214, 216, 228] | 2/21/2013 22:26 | ... | 209.0 | 214.0 | Critical | Critical | Major | 7 | 0 | 1 | 10 | 2012 12:10 |
| 3 | 1/10/2012 12:55 | 3.0 | A | 10.0 | 18.000000 | 2.001736 | Routine Inspection | 10 | [213, 222, 225, 227, 230] | 2/21/2013 22:26 | ... | 222.0 | 225.0 | Major | Major | Non-Major | 6 | 0 | 1 | 10 | 2012 12:55 |
| 4 | 1/10/2012 13:45 | 0.0 | A | 19.0 | 28.293079 | 2.493468 | Routine Inspection | 10 | [209, 217, 229, 230] | 2/21/2013 22:26 | ... | 217.0 | 229.0 | Critical | Major | Non-Major | 4 | 0 | 1 | 10 | 2012 13:45 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11956 | 9/9/2016 12:25 | 9.0 | A | 4.0 | 18.000000 | 2.919516 | Routine Inspection | 9 | [212, 213, 215] | 9/12/2016 9:59 | ... | 213.0 | 215.0 | Major | Major | Major | 3 | 0 | 9 | 9 | 2016 12:25 |
| 11957 | 9/9/2016 15:10 | 8.0 | A | 26.0 | 29.479618 | 1.938329 | Routine Inspection | 19 | [2, 022, 082, 112, 908, 290] | 9/9/2016 16:05 | ... | 208.0 | 211.0 | Critical | Critical | Major | 10 | 0 | 9 | 9 | 2016 15:10 |
| 11958 | 9/9/2016 8:00 | 5.0 | A | 12.0 | 25.198991 | 3.018171 | Routine Inspection | 15 | [21, 221, 321, 529, 092, 900] | 9/9/2016 9:57 | ... | 213.0 | 215.0 | Major | Major | Major | 8 | 0 | 9 | 9 | 2016 8:00 |
| 11959 | 9/9/2016 9:00 | 8.0 | A | 36.0 | 21.344443 | 5.238711 | Re-inspection | 25 | [208, 209, 216, 217, 218] | 9/9/2016 16:26 | ... | 209.0 | 216.0 | Critical | Critical | Major | 8 | 0 | 9 | 9 | 2016 9:00 |
| 11960 | 9/9/2016 9:45 | 6.0 | A | 45.0 | 26.413659 | 7.671624 | Routine Inspection | 19 | [2, 022, 092, 112, 122, 180] | 9/9/2016 10:43 | ... | 209.0 | 211.0 | Critical | Critical | Major | 6 | 0 | 9 | 9 | 2016 9:45 |
11961 rows × 22 columns
df['CURRENT_GRADE_ENCODED'] = LabelEncoder().fit_transform(df['CURRENT_GRADE'])
df['INSPECTION_TYPE_ENCODED'] = LabelEncoder().fit_transform(df['INSPECTION_TYPE'])
df['FIRST_VIOLATION_TYPE_ENCODED'] = LabelEncoder().fit_transform(df['FIRST_VIOLATION_TYPE'])
df['SECOND_VIOLATION_TYPE_ENCODED'] = LabelEncoder().fit_transform(df['SECOND_VIOLATION_TYPE'])
df['THIRD_VIOLATION_TYPE_ENCODED'] = LabelEncoder().fit_transform(df['THIRD_VIOLATION_TYPE'])
df=df.drop(['CURRENT_GRADE','INSPECTION_TYPE','FIRST_VIOLATION_TYPE','SECOND_VIOLATION_TYPE','THIRD_VIOLATION_TYPE'],axis=1)
df
| INSPECTION_TIME | CURRENT_DEMERITS | EMPLOYEE_COUNT | MEDIAN_EMPLOYEE_AGE | MEDIAN_EMPLOYEE_TENURE | INSPECTION_DEMERITS | VIOLATIONS_RAW | RECORD_UPDATED | LAT_LONG_RAW | FIRST_VIOLATION | ... | NUMBER_OF_VIOLATIONS | NEXT_INSPECTION_GRADE_C_OR_BELOW | year | month | day | CURRENT_GRADE_ENCODED | INSPECTION_TYPE_ENCODED | FIRST_VIOLATION_TYPE_ENCODED | SECOND_VIOLATION_TYPE_ENCODED | THIRD_VIOLATION_TYPE_ENCODED | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1/10/2012 10:25 | 3.0 | 32.0 | 26.942643 | 2.956749 | 20 | [202, 211, 215, 218, 224] | 2/21/2013 22:26 | (36.1589722, 115.0469673) | 202.0 | ... | 10 | 0 | 1 | 10 | 2012 10:25 | 0 | 1 | 0 | 2 | 2 |
| 1 | 1/10/2012 11:30 | 8.0 | 15.0 | 32.482685 | 2.540833 | 20 | [209, 214, 215, 217, 223] | 2/21/2013 22:26 | (36.120164, 115.335163) | 209.0 | ... | 8 | 0 | 1 | 10 | 2012 11:30 | 0 | 1 | 0 | 2 | 2 |
| 2 | 1/10/2012 12:10 | 0.0 | 11.0 | 32.357290 | 2.047184 | 19 | [202, 209, 214, 216, 228] | 2/21/2013 22:26 | (36.1723554, 115.1160331) | 202.0 | ... | 7 | 0 | 1 | 10 | 2012 12:10 | 0 | 1 | 0 | 0 | 2 |
| 3 | 1/10/2012 12:55 | 3.0 | 10.0 | 18.000000 | 2.001736 | 10 | [213, 222, 225, 227, 230] | 2/21/2013 22:26 | (36.2057231, 115.1129594) | 213.0 | ... | 6 | 0 | 1 | 10 | 2012 12:55 | 0 | 1 | 2 | 2 | 3 |
| 4 | 1/10/2012 13:45 | 0.0 | 19.0 | 28.293079 | 2.493468 | 10 | [209, 217, 229, 230] | 2/21/2013 22:26 | (36.1022507, 115.1699679) | 209.0 | ... | 4 | 0 | 1 | 10 | 2012 13:45 | 0 | 1 | 0 | 2 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11956 | 9/9/2016 12:25 | 9.0 | 4.0 | 18.000000 | 2.919516 | 9 | [212, 213, 215] | 9/12/2016 9:59 | (36.239207, 115.153386) | 212.0 | ... | 3 | 0 | 9 | 9 | 2016 12:25 | 0 | 1 | 2 | 2 | 2 |
| 11957 | 9/9/2016 15:10 | 8.0 | 26.0 | 29.479618 | 1.938329 | 19 | [2, 022, 082, 112, 908, 290] | 9/9/2016 16:05 | (36.1692635, 115.1406902) | 202.0 | ... | 10 | 0 | 9 | 9 | 2016 15:10 | 0 | 1 | 0 | 0 | 2 |
| 11958 | 9/9/2016 8:00 | 5.0 | 12.0 | 25.198991 | 3.018171 | 15 | [21, 221, 321, 529, 092, 900] | 9/9/2016 9:57 | (36.107315, 115.170976) | 212.0 | ... | 8 | 0 | 9 | 9 | 2016 8:00 | 0 | 1 | 2 | 2 | 2 |
| 11959 | 9/9/2016 9:00 | 8.0 | 36.0 | 21.344443 | 5.238711 | 25 | [208, 209, 216, 217, 218] | 9/9/2016 16:26 | (36.1587543, 115.2095993) | 208.0 | ... | 8 | 0 | 9 | 9 | 2016 9:00 | 0 | 0 | 0 | 0 | 2 |
| 11960 | 9/9/2016 9:45 | 6.0 | 45.0 | 26.413659 | 7.671624 | 19 | [2, 022, 092, 112, 122, 180] | 9/9/2016 10:43 | (36.100462, 115.224075) | 202.0 | ... | 6 | 0 | 9 | 9 | 2016 9:45 | 0 | 1 | 0 | 0 | 2 |
11961 rows × 22 columns
df = df.drop(['LAT_LONG_RAW','RECORD_UPDATED'],axis=1)
df[['date','time']] = df.INSPECTION_TIME.str.split(expand=True)
df['INSPECTION_TIME'] = (pd.to_datetime(df.pop('date'), format='%m/%d/%Y') +
pd.to_timedelta(df.pop('time') + ':00'))
VIOLATIONS_RAW_COUNT2=[]
for each in df.VIOLATIONS_RAW:
count=len(set(each))
VIOLATIONS_RAW_COUNT2.append(count)
VIOLATIONS_RAW_COUNT2
df.VIOLATIONS_RAW=VIOLATIONS_RAW_COUNT2
df = df.set_index('INSPECTION_TIME')
df = df.drop(['year','month','day'],axis=1)
NEXT_INSPECTION_GRADE_C_OR_BELOW_FLOAT=[]
for each in df.NEXT_INSPECTION_GRADE_C_OR_BELOW:
flot=float(each)
NEXT_INSPECTION_GRADE_C_OR_BELOW_FLOAT.append(flot)
NEXT_INSPECTION_GRADE_C_OR_BELOW_FLOAT
df.NEXT_INSPECTION_GRADE_C_OR_BELOW=NEXT_INSPECTION_GRADE_C_OR_BELOW_FLOAT
df
| CURRENT_DEMERITS | EMPLOYEE_COUNT | MEDIAN_EMPLOYEE_AGE | MEDIAN_EMPLOYEE_TENURE | INSPECTION_DEMERITS | VIOLATIONS_RAW | FIRST_VIOLATION | SECOND_VIOLATION | THIRD_VIOLATION | NUMBER_OF_VIOLATIONS | NEXT_INSPECTION_GRADE_C_OR_BELOW | CURRENT_GRADE_ENCODED | INSPECTION_TYPE_ENCODED | FIRST_VIOLATION_TYPE_ENCODED | SECOND_VIOLATION_TYPE_ENCODED | THIRD_VIOLATION_TYPE_ENCODED | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| INSPECTION_TIME | ||||||||||||||||
| 2012-01-10 10:25:00 | 3.0 | 32.0 | 26.942643 | 2.956749 | 20 | 5 | 202.0 | 211.0 | 215.0 | 10 | 0.0 | 0 | 1 | 0 | 2 | 2 |
| 2012-01-10 11:30:00 | 8.0 | 15.0 | 32.482685 | 2.540833 | 20 | 5 | 209.0 | 214.0 | 215.0 | 8 | 0.0 | 0 | 1 | 0 | 2 | 2 |
| 2012-01-10 12:10:00 | 0.0 | 11.0 | 32.357290 | 2.047184 | 19 | 5 | 202.0 | 209.0 | 214.0 | 7 | 0.0 | 0 | 1 | 0 | 0 | 2 |
| 2012-01-10 12:55:00 | 3.0 | 10.0 | 18.000000 | 2.001736 | 10 | 5 | 213.0 | 222.0 | 225.0 | 6 | 0.0 | 0 | 1 | 2 | 2 | 3 |
| 2012-01-10 13:45:00 | 0.0 | 19.0 | 28.293079 | 2.493468 | 10 | 4 | 209.0 | 217.0 | 229.0 | 4 | 0.0 | 0 | 1 | 0 | 2 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2016-09-09 12:25:00 | 9.0 | 4.0 | 18.000000 | 2.919516 | 9 | 3 | 212.0 | 213.0 | 215.0 | 3 | 0.0 | 0 | 1 | 2 | 2 | 2 |
| 2016-09-09 15:10:00 | 8.0 | 26.0 | 29.479618 | 1.938329 | 19 | 6 | 202.0 | 208.0 | 211.0 | 10 | 0.0 | 0 | 1 | 0 | 0 | 2 |
| 2016-09-09 08:00:00 | 5.0 | 12.0 | 25.198991 | 3.018171 | 15 | 6 | 212.0 | 213.0 | 215.0 | 8 | 0.0 | 0 | 1 | 2 | 2 | 2 |
| 2016-09-09 09:00:00 | 8.0 | 36.0 | 21.344443 | 5.238711 | 25 | 5 | 208.0 | 209.0 | 216.0 | 8 | 0.0 | 0 | 0 | 0 | 0 | 2 |
| 2016-09-09 09:45:00 | 6.0 | 45.0 | 26.413659 | 7.671624 | 19 | 6 | 202.0 | 209.0 | 211.0 | 6 | 0.0 | 0 | 1 | 0 | 0 | 2 |
11961 rows × 16 columns
drift_frame=drift_frame.drop(['recordYear'],axis=1)
drift_frame.columns
Index(['CURRENT_DEMERITS', 'INSPECTION_DEMERITS', 'VIOLATIONS_RAW',
'FIRST_VIOLATION', 'SECOND_VIOLATION', 'THIRD_VIOLATION',
'NUMBER_OF_VIOLATIONS', 'EMPLOYEE_COUNT', 'MEDIAN_EMPLOYEE_AGE',
'MEDIAN_EMPLOYEE_TENURE', 'CURRENT_GRADE_ENCODED',
'INSPECTION_TYPE_ENCODED', 'FIRST_VIOLATION_TYPE_ENCODED',
'SECOND_VIOLATION_TYPE_ENCODED', 'THIRD_VIOLATION_TYPE_ENCODED'],
dtype='object')
df=df[['CURRENT_DEMERITS', 'INSPECTION_DEMERITS', 'VIOLATIONS_RAW',
'FIRST_VIOLATION', 'SECOND_VIOLATION', 'THIRD_VIOLATION',
'NUMBER_OF_VIOLATIONS', 'EMPLOYEE_COUNT', 'MEDIAN_EMPLOYEE_AGE',
'MEDIAN_EMPLOYEE_TENURE', 'CURRENT_GRADE_ENCODED',
'INSPECTION_TYPE_ENCODED', 'FIRST_VIOLATION_TYPE_ENCODED',
'SECOND_VIOLATION_TYPE_ENCODED', 'THIRD_VIOLATION_TYPE_ENCODED','NEXT_INSPECTION_GRADE_C_OR_BELOW']]
ref_data = df
prod_data = drift_frame
ref_data = df
prod_data = drift_frame
# def plotPerColumnDistribution(df, nGraphShown, nGraphPerRow):
# nunique = df.nunique()
# df = df[[col for col in df if nunique[col] > 1 and nunique[col] < 50]] # For displaying purposes, pick columns that have between 1 and 50 unique values
# nRow, nCol = df.shape
# columnNames = list(df)
# nGraphRow = (nCol + nGraphPerRow - 1) / nGraphPerRow
# plt.figure(num = None, figsize = (6 * nGraphPerRow, 8 * nGraphRow), dpi = 80, facecolor = 'w', edgecolor = 'k')
# for i in range(min(nCol, nGraphShown)):
# plt.subplot(nGraphRow, nGraphPerRow, i + 1)
# columnDf = df.iloc[:, i]
# if (not np.issubdtype(type(columnDf.iloc[0]), np.number)):
# valueCounts = columnDf.value_counts()
# valueCounts.plot.bar()
# else:
# columnDf.hist()
# plt.ylabel('counts')
# plt.xticks(rotation = 90)
# plt.title(f'{columnNames[i]} (column {i})')
# plt.tight_layout(pad = 1.0, w_pad = 1.0, h_pad = 1.0)
# plt.show()
# plotPerColumnDistribution(df, 10, 5)
def plotCorrelationMatrix(df, graphWidth):
df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
if df.shape[1] < 2:
return
corr = df.corr()
plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
corrMat = plt.matshow(corr, fignum = 1)
plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
plt.yticks(range(len(corr.columns)), corr.columns)
plt.gca().xaxis.tick_bottom()
plt.colorbar(corrMat)
#plt.title(f'Correlation Matrix for {df}', fontsize=15)
plt.show()
plotCorrelationMatrix(df, 8)
# # Scatter and density plots
# def plotScatterMatrix(df, plotSize, textSize):
# df = df.select_dtypes(include =[np.number]) # keep only numerical columns
# # Remove rows and columns that would lead to df being singular
# df = df.dropna('columns')
# df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
# columnNames = list(df)
# if len(columnNames) > 10: # reduce the number of columns for matrix inversion of kernel density plots
# columnNames = columnNames[:10]
# df = df[columnNames]
# ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
# corrs = df.corr().values
# for i, j in zip(*plt.np.triu_indices_from(ax, k = 1)):
# ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=textSize)
# plt.suptitle('Scatter and Density Plot')
# plt.show()
# plotScatterMatrix(df, 9, 10)
continuous = ['MEDIAN_EMPLOYEE_AGE', 'MEDIAN_EMPLOYEE_TENURE']
nominal = ['CURRENT_GRADE_ENCODED',
'INSPECTION_TYPE_ENCODED','FIRST_VIOLATION', 'SECOND_VIOLATION',
'THIRD_VIOLATION','FIRST_VIOLATION_TYPE_ENCODED','SECOND_VIOLATION_TYPE_ENCODED','THIRD_VIOLATION_TYPE_ENCODED']
numeric = ['CURRENT_DEMERITS', 'EMPLOYEE_COUNT', 'INSPECTION_DEMERITS',
'NUMBER_OF_VIOLATIONS','VIOLATIONS_RAW']
target = ['NEXT_INSPECTION_GRADE_C_OR_BELOW']
features = nominal+ numeric+ continuous+ target
print(df.shape)
print(df.dtypes)
# Prnit out the unique values of selected_features
for each in features:
tmp = df[each].unique()
# Null values Handling
print(df.isnull().values.any()) # Is there any null value?
print(df.isnull().sum()) # Print the number of null value for each feature
#drop prediction for sake of restarting on part 2
(11961, 16) CURRENT_DEMERITS float64 INSPECTION_DEMERITS object VIOLATIONS_RAW int64 FIRST_VIOLATION float64 SECOND_VIOLATION float64 THIRD_VIOLATION float64 NUMBER_OF_VIOLATIONS object EMPLOYEE_COUNT float64 MEDIAN_EMPLOYEE_AGE float64 MEDIAN_EMPLOYEE_TENURE float64 CURRENT_GRADE_ENCODED int64 INSPECTION_TYPE_ENCODED int64 FIRST_VIOLATION_TYPE_ENCODED int64 SECOND_VIOLATION_TYPE_ENCODED int64 THIRD_VIOLATION_TYPE_ENCODED int64 NEXT_INSPECTION_GRADE_C_OR_BELOW float64 dtype: object False CURRENT_DEMERITS 0 INSPECTION_DEMERITS 0 VIOLATIONS_RAW 0 FIRST_VIOLATION 0 SECOND_VIOLATION 0 THIRD_VIOLATION 0 NUMBER_OF_VIOLATIONS 0 EMPLOYEE_COUNT 0 MEDIAN_EMPLOYEE_AGE 0 MEDIAN_EMPLOYEE_TENURE 0 CURRENT_GRADE_ENCODED 0 INSPECTION_TYPE_ENCODED 0 FIRST_VIOLATION_TYPE_ENCODED 0 SECOND_VIOLATION_TYPE_ENCODED 0 THIRD_VIOLATION_TYPE_ENCODED 0 NEXT_INSPECTION_GRADE_C_OR_BELOW 0 dtype: int64
#df.columns #no prediction
# Get the stats
for each in numeric:
print(each)
print(df[each].describe())
print('median', df[each].median())
print('mode', df[each].mode())
print('mean', df[each].mean())
CURRENT_DEMERITS count 11961.000000 mean 4.882117 std 4.294643 min 0.000000 25% 0.000000 50% 5.000000 75% 8.000000 max 100.000000 Name: CURRENT_DEMERITS, dtype: float64 median 5.0 mode 0 0.0 dtype: float64 mean 4.882116879859543 EMPLOYEE_COUNT count 11961.000000 mean 24.405903 std 1018.944400 min -7.000000 25% 8.000000 50% 14.000000 75% 21.000000 max 111447.000000 Name: EMPLOYEE_COUNT, dtype: float64 median 14.0 mode 0 3.0 dtype: float64 mean 24.405902516511997 INSPECTION_DEMERITS count 11961 unique 61 top 10 freq 1505 Name: INSPECTION_DEMERITS, dtype: object median 10.0 mode 0 10 dtype: object mean inf NUMBER_OF_VIOLATIONS count 11961 unique 26 top 3 freq 2853 Name: NUMBER_OF_VIOLATIONS, dtype: object median 5.0 mode 0 3 dtype: object mean inf VIOLATIONS_RAW count 11961.000000 mean 4.645598 std 1.408191 min 3.000000 25% 4.000000 50% 5.000000 75% 5.000000 max 25.000000 Name: VIOLATIONS_RAW, dtype: float64 median 5.0 mode 0 5 dtype: int64 mean 4.645598194130925
df = df[df['NEXT_INSPECTION_GRADE_C_OR_BELOW'].isin(["0", "1"])]
if 'FIRST_VIOLATION' in features:
df = df[(0 < df['FIRST_VIOLATION']) & (df['FIRST_VIOLATION'] < 350)]
if 'SECOND_VIOLATION' in features:
df = df[(0 < df['SECOND_VIOLATION']) & (df['SECOND_VIOLATION'] < 350)]
if 'THIRD_VIOLATION' in features:
df = df[(0 < df['THIRD_VIOLATION']) & (df['THIRD_VIOLATION'] < 350)]
if 'CURRENT_DEMERITS' in features:
df = df[(0 <= df['CURRENT_DEMERITS']) & (df['CURRENT_DEMERITS'] < 250)]
if 'EMPLOYEE_COUNT' in features:
df = df[(0 < df['EMPLOYEE_COUNT']) & (df['EMPLOYEE_COUNT'] < 150)]
from sklearn import preprocessing
df_disc = pd.DataFrame()
# Discretization
for each in continuous:
disc = pd.cut(df[each], bins=10, labels=np.arange(10), right=False)
df_disc = pd.concat([df_disc, disc], axis=1)
# Concatenate numeric features and discretized features
for each in numeric:
df_disc = pd.concat([df_disc, df[each]], axis=1)
# Normalization
x = df_disc.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df_norm = pd.DataFrame(x_scaled, columns=df_disc.columns, index=df_disc.index)
df_new = pd.concat([df, df_norm], axis=1)
df_new
| CURRENT_DEMERITS | INSPECTION_DEMERITS | VIOLATIONS_RAW | FIRST_VIOLATION | SECOND_VIOLATION | THIRD_VIOLATION | NUMBER_OF_VIOLATIONS | EMPLOYEE_COUNT | MEDIAN_EMPLOYEE_AGE | MEDIAN_EMPLOYEE_TENURE | ... | SECOND_VIOLATION_TYPE_ENCODED | THIRD_VIOLATION_TYPE_ENCODED | NEXT_INSPECTION_GRADE_C_OR_BELOW | MEDIAN_EMPLOYEE_AGE | MEDIAN_EMPLOYEE_TENURE | CURRENT_DEMERITS | EMPLOYEE_COUNT | INSPECTION_DEMERITS | NUMBER_OF_VIOLATIONS | VIOLATIONS_RAW | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2012-01-10 10:25:00 | 3.0 | 20 | 5 | 202.0 | 211.0 | 215.0 | 10 | 32.0 | 26.942643 | 2.956749 | ... | 2 | 2 | 0.0 | 0.222222 | 0.111111 | 0.03 | 0.591837 | 0.232558 | 0.179487 | 0.090909 |
| 2012-01-10 11:30:00 | 8.0 | 20 | 5 | 209.0 | 214.0 | 215.0 | 8 | 15.0 | 32.482685 | 2.540833 | ... | 2 | 2 | 0.0 | 0.444444 | 0.111111 | 0.08 | 0.244898 | 0.232558 | 0.128205 | 0.090909 |
| 2012-01-10 12:10:00 | 0.0 | 19 | 5 | 202.0 | 209.0 | 214.0 | 7 | 11.0 | 32.357290 | 2.047184 | ... | 0 | 2 | 0.0 | 0.444444 | 0.111111 | 0.00 | 0.163265 | 0.220930 | 0.102564 | 0.090909 |
| 2012-01-10 12:55:00 | 3.0 | 10 | 5 | 213.0 | 222.0 | 225.0 | 6 | 10.0 | 18.000000 | 2.001736 | ... | 2 | 3 | 0.0 | 0.000000 | 0.111111 | 0.03 | 0.142857 | 0.116279 | 0.076923 | 0.090909 |
| 2012-01-10 13:45:00 | 0.0 | 10 | 4 | 209.0 | 217.0 | 229.0 | 4 | 19.0 | 28.293079 | 2.493468 | ... | 2 | 3 | 0.0 | 0.333333 | 0.111111 | 0.00 | 0.326531 | 0.116279 | 0.025641 | 0.045455 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2016-09-09 12:25:00 | 9.0 | 9 | 3 | 212.0 | 213.0 | 215.0 | 3 | 4.0 | 18.000000 | 2.919516 | ... | 2 | 2 | 0.0 | 0.000000 | 0.111111 | 0.09 | 0.020408 | 0.104651 | 0.000000 | 0.000000 |
| 2016-09-09 15:10:00 | 8.0 | 19 | 6 | 202.0 | 208.0 | 211.0 | 10 | 26.0 | 29.479618 | 1.938329 | ... | 0 | 2 | 0.0 | 0.333333 | 0.111111 | 0.08 | 0.469388 | 0.220930 | 0.179487 | 0.136364 |
| 2016-09-09 08:00:00 | 5.0 | 15 | 6 | 212.0 | 213.0 | 215.0 | 8 | 12.0 | 25.198991 | 3.018171 | ... | 2 | 2 | 0.0 | 0.222222 | 0.111111 | 0.05 | 0.183673 | 0.174419 | 0.128205 | 0.136364 |
| 2016-09-09 09:00:00 | 8.0 | 25 | 5 | 208.0 | 209.0 | 216.0 | 8 | 36.0 | 21.344443 | 5.238711 | ... | 0 | 2 | 0.0 | 0.111111 | 0.333333 | 0.08 | 0.673469 | 0.290698 | 0.128205 | 0.090909 |
| 2016-09-09 09:45:00 | 6.0 | 19 | 6 | 202.0 | 209.0 | 211.0 | 6 | 45.0 | 26.413659 | 7.671624 | ... | 0 | 2 | 0.0 | 0.222222 | 0.444444 | 0.06 | 0.857143 | 0.220930 | 0.076923 | 0.136364 |
11958 rows × 23 columns
# Train_Set and Test_Set import, select desired features, and preprocessing
# Train_Set and Test_Set import
df['train'] = 'Yes'
#Now to follow same processing steps with Test Data
nominal = ['CURRENT_GRADE_ENCODED',
'INSPECTION_TYPE_ENCODED','FIRST_VIOLATION', 'SECOND_VIOLATION',
'THIRD_VIOLATION','FIRST_VIOLATION_TYPE_ENCODED','SECOND_VIOLATION_TYPE_ENCODED','THIRD_VIOLATION_TYPE_ENCODED']
numeric = ['CURRENT_DEMERITS', 'EMPLOYEE_COUNT', 'INSPECTION_DEMERITS',
'NUMBER_OF_VIOLATIONS','VIOLATIONS_RAW']
continuous = ['MEDIAN_EMPLOYEE_AGE', 'MEDIAN_EMPLOYEE_TENURE']
features = nominal+ numeric+ continuous
drift_frame
| CURRENT_DEMERITS | INSPECTION_DEMERITS | VIOLATIONS_RAW | FIRST_VIOLATION | SECOND_VIOLATION | THIRD_VIOLATION | NUMBER_OF_VIOLATIONS | EMPLOYEE_COUNT | MEDIAN_EMPLOYEE_AGE | MEDIAN_EMPLOYEE_TENURE | CURRENT_GRADE_ENCODED | INSPECTION_TYPE_ENCODED | FIRST_VIOLATION_TYPE_ENCODED | SECOND_VIOLATION_TYPE_ENCODED | THIRD_VIOLATION_TYPE_ENCODED | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| INSPECTION_TIME | |||||||||||||||
| 2010-01-04 10:15:00 | 3 | 5 | 3 | 13.0 | 31 | 35 | 3 | 15.508296 | 25.676835 | 1.849281 | 0 | 1 | 2 | 3 | 3 |
| 2010-01-04 10:25:00 | 6 | 10 | 4 | 14.0 | 27 | 31 | 4 | 16.521002 | 28.523227 | 3.486399 | 0 | 1 | 2 | 3 | 3 |
| 2010-01-04 11:55:00 | 0 | 3 | 3 | 24.0 | 31 | 37 | 3 | 23.598569 | 22.094623 | 3.313007 | 0 | 1 | 3 | 3 | 3 |
| 2010-01-04 13:10:00 | 9 | 10 | 5 | 13.0 | 23 | 30 | 5 | 5.071727 | 25.951826 | 4.931274 | 0 | 1 | 2 | 3 | 3 |
| 2010-01-04 13:15:00 | 0 | 8 | 3 | 16.0 | 22 | 36 | 3 | 43.568982 | 29.244711 | 2.710351 | 0 | 1 | 0 | 3 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2017-10-09 13:00:00 | 19 | 24 | 6 | 204.0 | 206 | 215 | 5 | 18.302526 | 74.317389 | 4.996175 | 1 | 1 | 0 | 0 | 2 |
| 2017-10-09 13:35:00 | 36 | 36 | 6 | 204.0 | 208 | 209 | 11 | 17.702804 | 60.093966 | 3.629311 | 2 | 1 | 0 | 0 | 0 |
| 2017-10-09 13:40:00 | 39 | 45 | 6 | 202.0 | 203 | 208 | 12 | 14.767160 | 74.734367 | 6.370513 | 2 | 1 | 0 | 0 | 0 |
| 2017-10-09 15:30:00 | 17 | 17 | 6 | 206.0 | 211 | 213 | 6 | 42.029730 | 73.185672 | 2.348484 | 1 | 1 | 0 | 2 | 2 |
| 2017-10-10 11:30:00 | 9 | 9 | 5 | 215.0 | 218 | 222 | 4 | 5.201612 | 76.617155 | 2.727325 | 0 | 1 | 2 | 2 | 2 |
7341 rows × 15 columns
#drift_frame
print(drift_frame.shape)
# Prnit out the unique values of selected_features
for each in features:
#print(each)
tmp = drift_frame[each].unique()
#print((tmp))
#print(drift_frame[each].value_counts(dropna=False))
# Null values Handling
#print(drift_frame.isnull().values.any()) # Is there any null value?
#print(drift_frame.isnull().sum()) # Print the number of null value for each feature
#drop prediction for sake of restarting on part 2
(7341, 15)
if 'FIRST_VIOLATION' in features:
drift_frame = drift_frame[(0 < drift_frame['FIRST_VIOLATION']) & (drift_frame['FIRST_VIOLATION'] < 350)]
if 'SECOND_VIOLATION' in features:
drift_frame = drift_frame[(0 < drift_frame['SECOND_VIOLATION']) & (drift_frame['SECOND_VIOLATION'] < 350)]
if 'THIRD_VIOLATION' in features:
drift_frame = drift_frame[(0 < drift_frame['THIRD_VIOLATION']) & (drift_frame['THIRD_VIOLATION'] < 350)]
if 'CURRENT_DEMERITS' in features:
drift_frame = drift_frame[(0 <= drift_frame['CURRENT_DEMERITS']) & (drift_frame['CURRENT_DEMERITS'] < 250)]
if 'EMPLOYEE_COUNT' in features:
drift_frame = drift_frame[(0 < drift_frame['EMPLOYEE_COUNT']) & (drift_frame['EMPLOYEE_COUNT'] < 150)]
drift_frame
| CURRENT_DEMERITS | INSPECTION_DEMERITS | VIOLATIONS_RAW | FIRST_VIOLATION | SECOND_VIOLATION | THIRD_VIOLATION | NUMBER_OF_VIOLATIONS | EMPLOYEE_COUNT | MEDIAN_EMPLOYEE_AGE | MEDIAN_EMPLOYEE_TENURE | CURRENT_GRADE_ENCODED | INSPECTION_TYPE_ENCODED | FIRST_VIOLATION_TYPE_ENCODED | SECOND_VIOLATION_TYPE_ENCODED | THIRD_VIOLATION_TYPE_ENCODED | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| INSPECTION_TIME | |||||||||||||||
| 2010-01-04 10:15:00 | 3 | 5 | 3 | 13.0 | 31 | 35 | 3 | 15.508296 | 25.676835 | 1.849281 | 0 | 1 | 2 | 3 | 3 |
| 2010-01-04 10:25:00 | 6 | 10 | 4 | 14.0 | 27 | 31 | 4 | 16.521002 | 28.523227 | 3.486399 | 0 | 1 | 2 | 3 | 3 |
| 2010-01-04 11:55:00 | 0 | 3 | 3 | 24.0 | 31 | 37 | 3 | 23.598569 | 22.094623 | 3.313007 | 0 | 1 | 3 | 3 | 3 |
| 2010-01-04 13:10:00 | 9 | 10 | 5 | 13.0 | 23 | 30 | 5 | 5.071727 | 25.951826 | 4.931274 | 0 | 1 | 2 | 3 | 3 |
| 2010-01-04 13:15:00 | 0 | 8 | 3 | 16.0 | 22 | 36 | 3 | 43.568982 | 29.244711 | 2.710351 | 0 | 1 | 0 | 3 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2017-10-09 13:00:00 | 19 | 24 | 6 | 204.0 | 206 | 215 | 5 | 18.302526 | 74.317389 | 4.996175 | 1 | 1 | 0 | 0 | 2 |
| 2017-10-09 13:35:00 | 36 | 36 | 6 | 204.0 | 208 | 209 | 11 | 17.702804 | 60.093966 | 3.629311 | 2 | 1 | 0 | 0 | 0 |
| 2017-10-09 13:40:00 | 39 | 45 | 6 | 202.0 | 203 | 208 | 12 | 14.767160 | 74.734367 | 6.370513 | 2 | 1 | 0 | 0 | 0 |
| 2017-10-09 15:30:00 | 17 | 17 | 6 | 206.0 | 211 | 213 | 6 | 42.029730 | 73.185672 | 2.348484 | 1 | 1 | 0 | 2 | 2 |
| 2017-10-10 11:30:00 | 9 | 9 | 5 | 215.0 | 218 | 222 | 4 | 5.201612 | 76.617155 | 2.727325 | 0 | 1 | 2 | 2 | 2 |
7337 rows × 15 columns
df['Train'] = 'Yes'
# Train_Set and Test_Set import, select desired features, and preprocessing
# Train_Set and Test_Set import
drift_frame[target] = "0"
drift_frame['Train'] = 'No'
# Concatenate Train and Test set
finalDF = df.append(drift_frame)
finalDF
| CURRENT_DEMERITS | INSPECTION_DEMERITS | VIOLATIONS_RAW | FIRST_VIOLATION | SECOND_VIOLATION | THIRD_VIOLATION | NUMBER_OF_VIOLATIONS | EMPLOYEE_COUNT | MEDIAN_EMPLOYEE_AGE | MEDIAN_EMPLOYEE_TENURE | CURRENT_GRADE_ENCODED | INSPECTION_TYPE_ENCODED | FIRST_VIOLATION_TYPE_ENCODED | SECOND_VIOLATION_TYPE_ENCODED | THIRD_VIOLATION_TYPE_ENCODED | NEXT_INSPECTION_GRADE_C_OR_BELOW | train | Train | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| INSPECTION_TIME | ||||||||||||||||||
| 2012-01-10 10:25:00 | 3.0 | 20 | 5 | 202.0 | 211.0 | 215.0 | 10 | 32.000000 | 26.942643 | 2.956749 | 0 | 1 | 0 | 2 | 2 | 0 | Yes | Yes |
| 2012-01-10 11:30:00 | 8.0 | 20 | 5 | 209.0 | 214.0 | 215.0 | 8 | 15.000000 | 32.482685 | 2.540833 | 0 | 1 | 0 | 2 | 2 | 0 | Yes | Yes |
| 2012-01-10 12:10:00 | 0.0 | 19 | 5 | 202.0 | 209.0 | 214.0 | 7 | 11.000000 | 32.357290 | 2.047184 | 0 | 1 | 0 | 0 | 2 | 0 | Yes | Yes |
| 2012-01-10 12:55:00 | 3.0 | 10 | 5 | 213.0 | 222.0 | 225.0 | 6 | 10.000000 | 18.000000 | 2.001736 | 0 | 1 | 2 | 2 | 3 | 0 | Yes | Yes |
| 2012-01-10 13:45:00 | 0.0 | 10 | 4 | 209.0 | 217.0 | 229.0 | 4 | 19.000000 | 28.293079 | 2.493468 | 0 | 1 | 0 | 2 | 3 | 0 | Yes | Yes |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2017-10-09 13:00:00 | 19.0 | 24 | 6 | 204.0 | 206.0 | 215.0 | 5 | 18.302526 | 74.317389 | 4.996175 | 1 | 1 | 0 | 0 | 2 | 0 | NaN | No |
| 2017-10-09 13:35:00 | 36.0 | 36 | 6 | 204.0 | 208.0 | 209.0 | 11 | 17.702804 | 60.093966 | 3.629311 | 2 | 1 | 0 | 0 | 0 | 0 | NaN | No |
| 2017-10-09 13:40:00 | 39.0 | 45 | 6 | 202.0 | 203.0 | 208.0 | 12 | 14.767160 | 74.734367 | 6.370513 | 2 | 1 | 0 | 0 | 0 | 0 | NaN | No |
| 2017-10-09 15:30:00 | 17.0 | 17 | 6 | 206.0 | 211.0 | 213.0 | 6 | 42.029730 | 73.185672 | 2.348484 | 1 | 1 | 0 | 2 | 2 | 0 | NaN | No |
| 2017-10-10 11:30:00 | 9.0 | 9 | 5 | 215.0 | 218.0 | 222.0 | 4 | 5.201612 | 76.617155 | 2.727325 | 0 | 1 | 2 | 2 | 2 | 0 | NaN | No |
19295 rows × 18 columns
def preprocessing_(df):
# Prnit out the unique values of selected_features
for each in features:
tmp = df[each].unique()
df = df.dropna(how='all') #Drop Row/Column Only if All the Values are Null
# X = preprocessing.StandardScaler().fit(df).transform(df)
df_new = pd.DataFrame()
# Normalization
x = df_disc.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df_norm = pd.DataFrame(x_scaled, columns=df_disc.columns, index=df_disc.index)
df_new = pd.concat([df_new, df_norm], axis=1)
return df, df_new
df = finalDF
df, df_new = preprocessing_(df)
# Separate Train and Test set
df_tst_ = df[df['Train']=='No']
df = df[df['Train']=='Yes']
df_new_tst = df_new.iloc[len(df):,:]
df_new = df_new.iloc[:len(df),:]
#***********************************************
# Specify features columns
X = df_new
# Specify target column
y = df['NEXT_INSPECTION_GRADE_C_OR_BELOW']
y=y.astype('int')
X
| MEDIAN_EMPLOYEE_AGE | MEDIAN_EMPLOYEE_TENURE | CURRENT_DEMERITS | EMPLOYEE_COUNT | INSPECTION_DEMERITS | NUMBER_OF_VIOLATIONS | VIOLATIONS_RAW | |
|---|---|---|---|---|---|---|---|
| 2012-01-10 10:25:00 | 0.222222 | 0.111111 | 0.03 | 0.591837 | 0.232558 | 0.179487 | 0.090909 |
| 2012-01-10 11:30:00 | 0.444444 | 0.111111 | 0.08 | 0.244898 | 0.232558 | 0.128205 | 0.090909 |
| 2012-01-10 12:10:00 | 0.444444 | 0.111111 | 0.00 | 0.163265 | 0.220930 | 0.102564 | 0.090909 |
| 2012-01-10 12:55:00 | 0.000000 | 0.111111 | 0.03 | 0.142857 | 0.116279 | 0.076923 | 0.090909 |
| 2012-01-10 13:45:00 | 0.333333 | 0.111111 | 0.00 | 0.326531 | 0.116279 | 0.025641 | 0.045455 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2016-09-09 12:25:00 | 0.000000 | 0.111111 | 0.09 | 0.020408 | 0.104651 | 0.000000 | 0.000000 |
| 2016-09-09 15:10:00 | 0.333333 | 0.111111 | 0.08 | 0.469388 | 0.220930 | 0.179487 | 0.136364 |
| 2016-09-09 08:00:00 | 0.222222 | 0.111111 | 0.05 | 0.183673 | 0.174419 | 0.128205 | 0.136364 |
| 2016-09-09 09:00:00 | 0.111111 | 0.333333 | 0.08 | 0.673469 | 0.290698 | 0.128205 | 0.090909 |
| 2016-09-09 09:45:00 | 0.222222 | 0.444444 | 0.06 | 0.857143 | 0.220930 | 0.076923 | 0.136364 |
11958 rows × 7 columns
# # Split dataframe into training + test + validation
splitter=StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=RS)
for train,test in splitter.split(X,y): #index split
X_train = X.iloc[train] #creates x train
y_train = y.iloc[train] #creates y train
X_test = X.iloc[test]
y_test = y.iloc[test]
sns.countplot(y_train).set_title("Outcome Count")
plt.show()
# summarize the new class distribution
from collections import Counter
total = Counter(y_train)
print(total)
Counter({0: 8044, 1: 1522})
X_train.shape, X_test.shape
((9566, 7), (2392, 7))
cols = X_train.columns
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train = pd.DataFrame(X_train, columns=[cols])
X_test = pd.DataFrame(X_test, columns=[cols])
X_train.describe()
| MEDIAN_EMPLOYEE_AGE | MEDIAN_EMPLOYEE_TENURE | CURRENT_DEMERITS | EMPLOYEE_COUNT | INSPECTION_DEMERITS | NUMBER_OF_VIOLATIONS | VIOLATIONS_RAW | |
|---|---|---|---|---|---|---|---|
| count | 9566.000000 | 9566.000000 | 9566.000000 | 9566.000000 | 9566.000000 | 9566.000000 | 9566.000000 |
| mean | 0.301357 | 0.204614 | 0.048763 | 0.245525 | 0.157498 | 0.100599 | 0.074753 |
| std | 0.171173 | 0.141697 | 0.043200 | 0.182900 | 0.097699 | 0.109871 | 0.064070 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.222222 | 0.111111 | 0.000000 | 0.102041 | 0.093023 | 0.037037 | 0.045455 |
| 50% | 0.333333 | 0.222222 | 0.050000 | 0.224490 | 0.116279 | 0.074074 | 0.090909 |
| 75% | 0.444444 | 0.333333 | 0.080000 | 0.367347 | 0.220930 | 0.148148 | 0.090909 |
| max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
# train a logistic regression model on the training set
from sklearn.linear_model import LogisticRegression
# instantiate the model
logreg = LogisticRegression(solver='liblinear', random_state=0)
# fit the model
logreg.fit(X_train, y_train)
LogisticRegression(random_state=0, solver='liblinear')
y_pred_test = logreg.predict(X_test)
y_pred_test
array([0, 0, 0, ..., 0, 0, 0])
# probability of getting output as 0 - no rain
logreg.predict_proba(X_test)[:,0]
array([0.82664296, 0.80712496, 0.85529621, ..., 0.85130164, 0.81271582,
0.83424862])
# probability of getting output as 1 - rain
logreg.predict_proba(X_test)[:,1]
array([0.17335704, 0.19287504, 0.14470379, ..., 0.14869836, 0.18728418,
0.16575138])
from sklearn.metrics import accuracy_score
print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred_test)))
Model accuracy score: 0.8407
y_pred_train = logreg.predict(X_train)
y_pred_train
array([0, 0, 0, ..., 0, 0, 0])
print('Training-set accuracy score: {0:0.4f}'. format(accuracy_score(y_train, y_pred_train)))
Training-set accuracy score: 0.8409
# print the scores on training and test set
print('Training set score: {:.4f}'.format(logreg.score(X_train, y_train)))
print('Test set score: {:.4f}'.format(logreg.score(X_test, y_test)))
Training set score: 0.8409 Test set score: 0.8407
# Print the Confusion Matrix and slice it into four pieces
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred_test)
print('Confusion matrix\n\n', cm)
print('\nTrue Positives(TP) = ', cm[0,0])
print('\nTrue Negatives(TN) = ', cm[1,1])
print('\nFalse Positives(FP) = ', cm[0,1])
print('\nFalse Negatives(FN) = ', cm[1,0])
Confusion matrix [[2011 0] [ 381 0]] True Positives(TP) = 2011 True Negatives(TN) = 0 False Positives(FP) = 0 False Negatives(FN) = 381
# visualize confusion matrix with seaborn heatmap
cm_matrix = pd.DataFrame(data=cm, columns=['Actual Positive:1', 'Actual Negative:0'],
index=['Predict Positive:1', 'Predict Negative:0'])
sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')
<matplotlib.axes._subplots.AxesSubplot at 0x7f9d806714c0>
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_test))
precision recall f1-score support
0 0.84 1.00 0.91 2011
1 0.00 0.00 0.00 381
accuracy 0.84 2392
macro avg 0.42 0.50 0.46 2392
weighted avg 0.71 0.84 0.77 2392
TP = cm[0,0]
TN = cm[1,1]
FP = cm[0,1]
FN = cm[1,0]
# print classification accuracy
classification_accuracy = (TP + TN) / float(TP + TN + FP + FN)
print('Classification accuracy : {0:0.4f}'.format(classification_accuracy))
Classification accuracy : 0.8407
# print classification error
classification_error = (FP + FN) / float(TP + TN + FP + FN)
print('Classification error : {0:0.4f}'.format(classification_error))
Classification error : 0.1593
# print precision score
precision = TP / float(TP + FP)
print('Precision : {0:0.4f}'.format(precision))
Precision : 1.0000
recall = TP / float(TP + FN)
print('Recall or Sensitivity : {0:0.4f}'.format(recall))
Recall or Sensitivity : 0.8407
true_positive_rate = TP / float(TP + FN)
print('True Positive Rate : {0:0.4f}'.format(true_positive_rate))
True Positive Rate : 0.8407
false_positive_rate = FP / float(FP + TN)
print('False Positive Rate : {0:0.4f}'.format(false_positive_rate))
False Positive Rate : nan
specificity = TN / (TN + FP)
print('Specificity : {0:0.4f}'.format(specificity))
Specificity : nan
# import pandas as pd
# import numpy as np
# import requests
# import zipfile
# import io
# from sklearn.ensemble import RandomForestRegressor
# from evidently.dashboard import Dashboard
# from evidently import ColumnMapping
# from evidently.tabs import RegressionPerformanceTab
# from evidently.model_profile import Profile
# from evidently.profile_sections import RegressionPerformanceProfileSection
# #target = 'NEXT_INSPECTION_GRADE_C_OR_BELOW' #using NEXT_INSPECTION_GRADE_C_OR_BELOW as target gives error
# target = 'CURRENT_DEMERITS' #using NEXT_INSPECTION_GRADE_C_OR_BELOW as target gives error
# # numerical_features = ['CURRENT_DEMERITS', 'INSPECTION_DEMERITS', 'VIOLATIONS_RAW',
# # 'FIRST_VIOLATION', 'SECOND_VIOLATION','THIRD_VIOLATION',
# # 'NUMBER_OF_VIOLATIONS','NUMBER_OF_VIOLATIONS','EMPLOYEE_COUNT',
# # 'MEDIAN_EMPLOYEE_AGE','MEDIAN_EMPLOYEE_TENURE']
# numerical_features = [ 'INSPECTION_DEMERITS', 'VIOLATIONS_RAW',
# 'FIRST_VIOLATION', 'SECOND_VIOLATION','THIRD_VIOLATION',
# 'NUMBER_OF_VIOLATIONS','NUMBER_OF_VIOLATIONS','EMPLOYEE_COUNT',
# 'MEDIAN_EMPLOYEE_AGE','MEDIAN_EMPLOYEE_TENURE']
# categorical_features = ['CURRENT_GRADE_ENCODED', 'INSPECTION_TYPE_ENCODED',
# 'FIRST_VIOLATION_TYPE_ENCODED', 'SECOND_VIOLATION_TYPE_ENCODED',
# 'THIRD_VIOLATION_TYPE_ENCODED',]
# features = numerical_features + categorical_features
# model = RandomForestRegressor(random_state = 0)
# model.fit(ref_data[features], ref_data[target])
# ref_data['prediction'] = model.predict(ref_data[features])
# prod_data['prediction'] = model.predict(prod_data[features])
# df=df[['CURRENT_DEMERITS', 'INSPECTION_DEMERITS', 'VIOLATIONS_RAW',
# 'FIRST_VIOLATION', 'SECOND_VIOLATION', 'THIRD_VIOLATION',
# 'NUMBER_OF_VIOLATIONS', 'EMPLOYEE_COUNT', 'MEDIAN_EMPLOYEE_AGE',
# 'MEDIAN_EMPLOYEE_TENURE', 'CURRENT_GRADE_ENCODED',
# 'INSPECTION_TYPE_ENCODED', 'FIRST_VIOLATION_TYPE_ENCODED',
# 'SECOND_VIOLATION_TYPE_ENCODED', 'THIRD_VIOLATION_TYPE_ENCODED','NEXT_INSPECTION_GRADE_C_OR_BELOW']]
# ref_data = df
# # prod_data = drift_frame
# column_mapping = ColumnMapping(target,
# 'prediction',
# numerical_features=numerical_features,
# categorical_features=categorical_features)
# dashboard = Dashboard(tabs=[RegressionPerformanceTab()])
# dashboard.calculate(ref_data, prod_data, column_mapping=column_mapping)
# dashboard.show()
# drift_regression_performance_profile = Profile(sections=[RegressionPerformanceProfileSection()])
# drift_regression_performance_profile.calculate(ref_data, prod_data, column_mapping=column_mapping)
# regression_profile = drift_regression_performance_profile.json()
# regression_profile
# import pandas as pd
# from sklearn import datasets
# from evidently.dashboard import Dashboard
# from evidently.pipeline.column_mapping import ColumnMapping
# from evidently.tabs import DataDriftTab, NumTargetDriftTab
# from evidently.model_profile import Profile
# from evidently.profile_sections import DataDriftProfileSection, NumTargetDriftProfileSection
# column_mapping = ColumnMapping()
# column_mapping.target = 'NEXT_INSPECTION_GRADE_C_OR_BELOW'
# column_mapping.prediction = None
# column_mapping.datetime = None
# column_mapping.numerical_features = ['CURRENT_DEMERITS', 'INSPECTION_DEMERITS', 'VIOLATIONS_RAW',
# 'FIRST_VIOLATION', 'SECOND_VIOLATION','THIRD_VIOLATION',
# 'NUMBER_OF_VIOLATIONS','NUMBER_OF_VIOLATIONS','EMPLOYEE_COUNT',
# 'MEDIAN_EMPLOYEE_AGE','MEDIAN_EMPLOYEE_TENURE']
# column_mapping.categorical_features = ['CURRENT_GRADE_ENCODED', 'INSPECTION_TYPE_ENCODED',
# 'FIRST_VIOLATION_TYPE_ENCODED', 'SECOND_VIOLATION_TYPE_ENCODED',
# 'THIRD_VIOLATION_TYPE_ENCODED',]
# food_data_and_target_drift_dashboard = Dashboard(tabs=[DataDriftTab(), NumTargetDriftTab()])
# food_data_and_target_drift_dashboard.calculate(df[:1000], df[1000:],
# column_mapping = column_mapping)
# food_data_and_target_drift_dashboard.show()
# food_target_and_data_drift_profile = Profile(sections=[DataDriftProfileSection(), NumTargetDriftProfileSection()])
# food_target_and_data_drift_profile.calculate(df[:1000], df[1000:],
# column_mapping = column_mapping)
# food_target_and_data_drift_profile.json()